From 7fa23e69f7d0cd1016745bd5762208bfe40a196d Mon Sep 17 00:00:00 2001
From: Nir Soffer
Date: Thu, 12 Sep 2024 23:26:35 +0300
Subject: [PATCH 01/14] Add debug logs in envfile

To make it possible to debug platform and machine detection in GitHub
or in a developer environment.

Signed-off-by: Nir Soffer
---
 test/Makefile         | 2 +-
 test/drenv/envfile.py | 9 ++++++++-
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/test/Makefile b/test/Makefile
index 75ea8d9a4..7008c364c 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -50,7 +50,7 @@ coverage-html:
 	xdg-open htmlcov/index.html
 
 cluster:
-	drenv start --name-prefix $(prefix) $(env)
+	drenv start --name-prefix $(prefix) $(env) -v
 
 clean:
 	drenv delete --name-prefix $(prefix) $(env)
diff --git a/test/drenv/envfile.py b/test/drenv/envfile.py
index 403b95b15..8902b20ab 100644
--- a/test/drenv/envfile.py
+++ b/test/drenv/envfile.py
@@ -1,8 +1,9 @@
 # SPDX-FileCopyrightText: The RamenDR authors
 # SPDX-License-Identifier: Apache-2.0
 
-import os
 import copy
+import logging
+import os
 import platform
 
 import yaml
@@ -52,6 +53,7 @@ def platform_defaults():
     # By default, use minikube defaults.
 
     operating_system = platform.system().lower()
+    logging.debug("[envfile] Detected os: '%s'", operating_system)
     return _PLATFORM_DEFAULTS.get(operating_system, _PLATFORM_DEFAULTS["__default__"])
@@ -149,6 +151,7 @@ def _validate_profile(profile, addons_root):
 def _validate_platform_defaults(profile):
     platform = platform_defaults()
     machine = os.uname().machine
+    logging.debug("[envfile] Detected machine: '%s'", machine)
 
     if profile["driver"] == VM:
         profile["driver"] = platform[VM][machine]
@@ -158,6 +161,10 @@ def _validate_platform_defaults(profile):
     if profile["network"] == SHARED_NETWORK:
         profile["network"] = platform[SHARED_NETWORK][machine]
 
+    logging.debug("[envfile] Using provider: '%s'", profile["provider"])
+    logging.debug("[envfile] Using driver: '%s'", profile["driver"])
+    logging.debug("[envfile] Using network: '%s'", profile["network"])
+
 
 def _validate_worker(worker, env, addons_root, index):
     worker["name"] = f'{env["name"]}/{worker.get("name", index)}'

From f4e3782a81c84ba668fd67c655f42788dd559735 Mon Sep 17 00:00:00 2001
From: Nir Soffer
Date: Tue, 27 Aug 2024 21:27:32 +0300
Subject: [PATCH 02/14] Clearer argument names

All the public functions in the minikube module accept a profile, but
this is actually a profile name. We want to pass a profile dict to
start(). Use `name` for functions accepting a profile name.

Signed-off-by: Nir Soffer
---
 test/drenv/minikube.py | 42 +++++++++++++++++++++---------------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/test/drenv/minikube.py b/test/drenv/minikube.py
index e8ed0b2f8..d7d1c9663 100644
--- a/test/drenv/minikube.py
+++ b/test/drenv/minikube.py
@@ -29,12 +29,12 @@ def profile(command, output=None):
     return _run("profile", command, output=output)
 
 
-def status(profile, output=None):
-    return _run("status", profile=profile, output=output)
+def status(name, output=None):
+    return _run("status", profile=name, output=output)
 
 
 def start(
-    profile,
+    name,
     driver=None,
     container_runtime=None,
     extra_disks=None,
@@ -94,23 +94,23 @@ def start(
     # TODO: Use --interactive=false when the bug is fixed.
     # https://github.com/kubernetes/minikube/issues/19518
 
-    _watch("start", *args, profile=profile)
+    _watch("start", *args, profile=name)
 
 
-def stop(profile):
-    _watch("stop", profile=profile)
+def stop(name):
+    _watch("stop", profile=name)
 
 
-def delete(profile):
-    _watch("delete", profile=profile)
+def delete(name):
+    _watch("delete", profile=name)
 
 
-def cp(profile, src, dst):
-    _watch("cp", src, dst, profile=profile)
+def cp(name, src, dst):
+    _watch("cp", src, dst, profile=name)
 
 
-def ssh(profile, command):
-    _watch("ssh", command, profile=profile)
+def ssh(name, command):
+    _watch("ssh", command, profile=name)
 
 
 def setup_files():
@@ -127,7 +127,7 @@ def setup_files():
     _setup_systemd_resolved(version)
 
 
-def load_files(profile):
+def load_files(name):
     """
     Load configuration done in setup_files() before the minikube cluster was
     started.
@@ -135,8 +135,8 @@ def load_files(profile):
     Must be called after the cluster is started, before running any addon. Not
     need when starting a stopped cluster.
     """
-    _load_sysctl(profile)
-    _load_systemd_resolved(profile)
+    _load_sysctl(name)
+    _load_systemd_resolved(name)
 
 
 def cleanup_files():
@@ -178,11 +178,11 @@ def _setup_sysctl(version):
     _write_file(path, data)
 
 
-def _load_sysctl(profile):
+def _load_sysctl(name):
    if not os.path.exists(_sysctl_drenv_conf()):
        return
-    logging.debug("[%s] Loading drenv sysctl configuration", profile)
-    ssh(profile, "sudo sysctl -p /etc/sysctl.d/99-drenv.conf")
+    logging.debug("[%s] Loading drenv sysctl configuration", name)
+    ssh(name, "sudo sysctl -p /etc/sysctl.d/99-drenv.conf")
 
 
 def _sysctl_drenv_conf():
@@ -211,11 +211,11 @@ def _setup_systemd_resolved(version):
     _write_file(path, data)
 
 
-def _load_systemd_resolved(profile):
+def _load_systemd_resolved(name):
    if not os.path.exists(_systemd_resolved_drenv_conf()):
        return
-    logging.debug("[%s] Loading drenv systemd-resolved configuration", profile)
-    ssh(profile, "sudo systemctl restart systemd-resolved.service")
+    logging.debug("[%s] Loading drenv systemd-resolved configuration", name)
+    ssh(name, "sudo systemctl restart systemd-resolved.service")
 
 
 def _systemd_resolved_drenv_conf():

From 9e54879656b8d42a8a2044fce86f7700180048c3 Mon Sep 17 00:00:00 2001
From: Nir Soffer
Date: Tue, 27 Aug 2024 21:20:00 +0300
Subject: [PATCH 03/14] Move minikube helpers to minikube module

The minikube module is mostly a thin wrapper for the minikube command,
and we have higher level helpers in __main__.py. Since we want to have
multiple providers (e.g. lima, external), move all the helpers to the
minikube module.
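
For example, after the move the start flow in __main__.py reduces to
direct calls into the module (a sketch based on the hunks below, not
new behavior):

    is_restart = minikube.exists(profile["name"])
    minikube.start(profile, verbose=args.verbose)
    ...
    minikube.stop(profile["name"])
    minikube.delete(profile["name"])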
Signed-off-by: Nir Soffer
---
 test/drenv/__main__.py |  68 ++-----------------------
 test/drenv/minikube.py | 109 ++++++++++++++++++++++++-----------------
 2 files changed, 68 insertions(+), 109 deletions(-)

diff --git a/test/drenv/__main__.py b/test/drenv/__main__.py
index d67b2dd2d..bca1a937c 100644
--- a/test/drenv/__main__.py
+++ b/test/drenv/__main__.py
@@ -354,8 +354,8 @@ def start_cluster(profile, hooks=(), args=None, **options):
     if profile["external"]:
         logging.debug("[%s] Skipping external cluster", profile["name"])
     else:
-        is_restart = minikube_profile_exists(profile["name"])
-        start_minikube_cluster(profile, verbose=args.verbose)
+        is_restart = minikube.exists(profile["name"])
+        minikube.start(profile, verbose=args.verbose)
         if profile["containerd"]:
             logging.info("[%s] Configuring containerd", profile["name"])
             containerd.configure(profile)
@@ -390,14 +390,14 @@ def stop_cluster(profile, hooks=(), **options):
     if profile["external"]:
         logging.debug("[%s] Skipping external cluster", profile["name"])
     elif cluster_status != cluster.UNKNOWN:
-        stop_minikube_cluster(profile)
+        minikube.stop(profile["name"])
 
 
 def delete_cluster(profile, **options):
     if profile["external"]:
         logging.debug("[%s] Skipping external cluster", profile["name"])
     else:
-        delete_minikube_cluster(profile)
+        minikube.delete(profile["name"])
 
     profile_config = drenv.config_dir(profile["name"])
     if os.path.exists(profile_config):
@@ -405,66 +405,6 @@ def delete_cluster(profile, **options):
         shutil.rmtree(profile_config)
 
 
-def minikube_profile_exists(name):
-    out = minikube.profile("list", output="json")
-    profiles = json.loads(out)
-    for profile in profiles["valid"]:
-        if profile["Name"] == name:
-            return True
-    return False
-
-
-def start_minikube_cluster(profile, verbose=False):
-    start = time.monotonic()
-    logging.info("[%s] Starting minikube cluster", profile["name"])
-
-    minikube.start(
-        profile["name"],
-        driver=profile["driver"],
-        container_runtime=profile["container_runtime"],
-        extra_disks=profile["extra_disks"],
-        disk_size=profile["disk_size"],
-        network=profile["network"],
-        nodes=profile["nodes"],
-        cni=profile["cni"],
-        cpus=profile["cpus"],
-        memory=profile["memory"],
-        addons=profile["addons"],
-        service_cluster_ip_range=profile["service_cluster_ip_range"],
-        extra_config=profile["extra_config"],
-        feature_gates=profile["feature_gates"],
-        alsologtostderr=verbose,
-    )
-
-    logging.info(
-        "[%s] Cluster started in %.2f seconds",
-        profile["name"],
-        time.monotonic() - start,
-    )
-
-
-def stop_minikube_cluster(profile):
-    start = time.monotonic()
-    logging.info("[%s] Stopping cluster", profile["name"])
-    minikube.stop(profile["name"])
-    logging.info(
-        "[%s] Cluster stopped in %.2f seconds",
-        profile["name"],
-        time.monotonic() - start,
-    )
-
-
-def delete_minikube_cluster(profile):
-    start = time.monotonic()
-    logging.info("[%s] Deleting cluster", profile["name"])
-    minikube.delete(profile["name"])
-    logging.info(
-        "[%s] Cluster deleted in %.2f seconds",
-        profile["name"],
-        time.monotonic() - start,
-    )
-
-
 def restart_failed_deployments(profile, initial_wait=30):
     """
     When restarting, kubectl can report stale status for a while, before it
diff --git a/test/drenv/minikube.py b/test/drenv/minikube.py
index d7d1c9663..4c47f03fd 100644
--- a/test/drenv/minikube.py
+++ b/test/drenv/minikube.py
@@ -5,6 +5,7 @@
 import json
 import logging
 import os
+import time
 
 from packaging.version import Version
 
@@ -33,60 +34,57 @@ def status(name, output=None):
     return _run("status", profile=name, output=output)
 
 
-def start(
-    name,
-    driver=None,
-    container_runtime=None,
-    extra_disks=None,
-    disk_size=None,
-    network=None,
-    nodes=None,
-    cni=None,
-    cpus=None,
-    memory=None,
-    addons=(),
-    service_cluster_ip_range=None,
-    extra_config=None,
-    feature_gates=None,
-    alsologtostderr=False,
-):
+def start(profile, verbose=False):
+    start = time.monotonic()
+    logging.info("[%s] Starting minikube cluster", profile["name"])
+
     args = []
 
-    if driver:
-        args.extend(("--driver", driver))
-    if container_runtime:
-        args.extend(("--container-runtime", container_runtime))
-    if extra_disks:
-        args.extend(("--extra-disks", str(extra_disks)))
-    if disk_size:
-        args.extend(("--disk-size", disk_size))  # "4g"
-    if network:
-        args.extend(("--network", network))
-    if nodes:
-        args.extend(("--nodes", str(nodes)))
-    if cni:
-        args.extend(("--cni", cni))
-    if cpus:
-        args.extend(("--cpus", str(cpus)))
-    if memory:
-        args.extend(("--memory", memory))
-    if addons:
-        args.extend(("--addons", ",".join(addons)))
-    if service_cluster_ip_range:
-        args.extend(("--service-cluster-ip-range", service_cluster_ip_range))
+    if profile["driver"]:
+        args.extend(("--driver", profile["driver"]))
+
+    if profile["container_runtime"]:
+        args.extend(("--container-runtime", profile["container_runtime"]))
+
+    if profile["extra_disks"]:
+        args.extend(("--extra-disks", str(profile["extra_disks"])))
+
+    if profile["disk_size"]:
+        args.extend(("--disk-size", profile["disk_size"]))  # "4g"
+
+    if profile["network"]:
+        args.extend(("--network", profile["network"]))
+
+    if profile["nodes"]:
+        args.extend(("--nodes", str(profile["nodes"])))
+
+    if profile["cni"]:
+        args.extend(("--cni", profile["cni"]))
+
+    if profile["cpus"]:
+        args.extend(("--cpus", str(profile["cpus"])))
+
+    if profile["memory"]:
+        args.extend(("--memory", profile["memory"]))
+
+    if profile["addons"]:
+        args.extend(("--addons", ",".join(profile["addons"])))
+
+    if profile["service_cluster_ip_range"]:
+        args.extend(("--service-cluster-ip-range", profile["service_cluster_ip_range"]))
 
     for pair in EXTRA_CONFIG:
         args.extend(("--extra-config", pair))
 
-    if extra_config:
-        for pair in extra_config:
+    if profile["extra_config"]:
+        for pair in profile["extra_config"]:
             args.extend(("--extra-config", pair))
 
-    if feature_gates:
+    if profile["feature_gates"]:
         # Unlike --extra-config this requires one comma separated value.
-        args.extend(("--feature-gates", ",".join(feature_gates)))
+        args.extend(("--feature-gates", ",".join(profile["feature_gates"])))
 
-    if alsologtostderr:
+    if verbose:
         args.append("--alsologtostderr")
 
     args.append("--insecure-registry=host.minikube.internal:5000")
 
     # TODO: Use --interactive=false when the bug is fixed.
     # https://github.com/kubernetes/minikube/issues/19518
 
-    _watch("start", *args, profile=name)
+    _watch("start", *args, profile=profile["name"])
+
+    logging.info(
+        "[%s] Cluster started in %.2f seconds",
+        profile["name"],
+        time.monotonic() - start,
+    )
 
 
 def stop(name):
+    start = time.monotonic()
+    logging.info("[%s] Stopping cluster", name)
     _watch("stop", profile=name)
+    logging.info("[%s] Cluster stopped in %.2f seconds", name, time.monotonic() - start)
 
 
 def delete(name):
+    start = time.monotonic()
+    logging.info("[%s] Deleting cluster", name)
     _watch("delete", profile=name)
+    logging.info("[%s] Cluster deleted in %.2f seconds", name, time.monotonic() - start)
 
 
 def cp(name, src, dst):
@@ -113,6 +123,15 @@ def ssh(name, command):
     _watch("ssh", command, profile=name)
 
 
+def exists(name):
+    out = profile("list", output="json")
+    profiles = json.loads(out)
+    for p in profiles["valid"]:
+        if p["Name"] == name:
+            return True
+    return False
+
+
 def setup_files():
     """
     Set up minikube to work with drenv. Must be called before starting the

From eedfbbc2d5a5f39061758bac9ce23b270dbceb05 Mon Sep 17 00:00:00 2001
From: Nir Soffer
Date: Mon, 9 Sep 2024 16:44:40 +0300
Subject: [PATCH 04/14] Remove `drenv setup` from make-venv

This step is not part of creating the venv, and will be more
complicated to do as part of creating the venv when adding providers.
This must be run manually as we do in the CI.

Signed-off-by: Nir Soffer
---
 hack/make-venv | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/hack/make-venv b/hack/make-venv
index cc302ce38..a37ab78f7 100755
--- a/hack/make-venv
+++ b/hack/make-venv
@@ -27,9 +27,6 @@ cp coverage.pth $venv/lib/python*/site-packages
 echo "Adding venv symlink..."
 ln -sf $venv/bin/activate venv
 
-echo "Setting up minikube for drenv"
-$venv/bin/drenv setup -v
-
 echo
 echo "To activate the environment run:"
 echo

From 8569c91a6d142d0ecb8dfe1a4a298e42fa488f84 Mon Sep 17 00:00:00 2001
From: Nir Soffer
Date: Tue, 27 Aug 2024 22:54:45 +0300
Subject: [PATCH 05/14] Introduce drenv providers

The envfile can now have a "provider" property, defaulting to
"$provider", which expands to the platform default provider. The first
provider is minikube.

The setup and cleanup commands now require an env file, since they need
to know which provider to set up.

Signed-off-by: Nir Soffer
---
 .github/workflows/e2e.yaml             |  4 +--
 test/drenv/__main__.py                 | 35 ++++++++++++++++----------
 test/drenv/containerd.py               |  9 +++----
 test/drenv/envfile.py                  | 22 +++++++++++++---
 test/drenv/providers/__init__.py       |  8 ++++++
 test/drenv/{ => providers}/minikube.py |  2 +-
 test/setup.py                          |  5 +++-
 7 files changed, 60 insertions(+), 25 deletions(-)
 create mode 100644 test/drenv/providers/__init__.py
 rename test/drenv/{ => providers}/minikube.py (99%)

diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml
index 0d954009b..4824fc254 100644
--- a/.github/workflows/e2e.yaml
+++ b/.github/workflows/e2e.yaml
@@ -29,7 +29,7 @@ jobs:
 
       - name: Setup drenv
         working-directory: test
-        run: drenv setup -v
+        run: drenv setup -v envs/regional-dr.yaml
 
       - name: Install ramenctl
         run: pip install -e ramenctl
@@ -100,4 +100,4 @@ jobs:
       - name: Cleanup drenv
         if: always()
         working-directory: test
-        run: drenv cleanup -v
+        run: drenv cleanup -v envs/regional-dr.yaml
diff --git a/test/drenv/__main__.py b/test/drenv/__main__.py
index bca1a937c..f33eac063 100644
--- a/test/drenv/__main__.py
+++ b/test/drenv/__main__.py
@@ -22,7 +22,7 @@
 from . import containerd
 from . import envfile
 from . import kubectl
-from . import minikube
+from . import providers
 from . import ramen
 from . import shutdown
@@ -114,8 +114,8 @@ def parse_args():
     add_command(sp, "dump", do_dump, help="dump an environment yaml")
     add_command(sp, "clear", do_clear, help="cleared cached resources", envfile=False)
-    add_command(sp, "setup", do_setup, help="setup minikube for drenv", envfile=False)
-    add_command(sp, "cleanup", do_cleanup, help="cleanup minikube", envfile=False)
+    add_command(sp, "setup", do_setup, help="setup host for drenv")
+    add_command(sp, "cleanup", do_cleanup, help="cleanup host")
 
     return parser.parse_args()
 
@@ -183,13 +183,19 @@ def handle_termination_signal(signo, frame):
 
 
 def do_setup(args):
-    logging.info("[main] Setting up minikube for drenv")
-    minikube.setup_files()
+    env = load_env(args)
+    for name in set(p["provider"] for p in env["profiles"]):
+        logging.info("[main] Setting up '%s' for drenv", name)
+        provider = providers.get(name)
+        provider.setup_files()
 
 
 def do_cleanup(args):
-    logging.info("[main] Cleaning up minikube")
-    minikube.cleanup_files()
+    env = load_env(args)
+    for name in set(p["provider"] for p in env["profiles"]):
+        logging.info("[main] Cleaning up '%s' for drenv", name)
+        provider = providers.get(name)
+        provider.cleanup_files()
 
 
 def do_clear(args):
@@ -351,18 +357,19 @@ def collect_addons(env):
 
 
 def start_cluster(profile, hooks=(), args=None, **options):
+    provider = providers.get(profile["provider"])
     if profile["external"]:
         logging.debug("[%s] Skipping external cluster", profile["name"])
     else:
-        is_restart = minikube.exists(profile["name"])
-        minikube.start(profile, verbose=args.verbose)
+        is_restart = provider.exists(profile["name"])
+        provider.start(profile, verbose=args.verbose)
         if profile["containerd"]:
             logging.info("[%s] Configuring containerd", profile["name"])
-            containerd.configure(profile)
+            containerd.configure(provider, profile)
         if is_restart:
             restart_failed_deployments(profile)
         else:
-            minikube.load_files(profile["name"])
+            provider.load_files(profile["name"])
 
     if hooks:
         execute(
@@ -387,17 +394,19 @@ def stop_cluster(profile, hooks=(), **options):
             allow_failure=True,
         )
 
+    provider = providers.get(profile["provider"])
     if profile["external"]:
         logging.debug("[%s] Skipping external cluster", profile["name"])
     elif cluster_status != cluster.UNKNOWN:
-        minikube.stop(profile["name"])
+        provider.stop(profile["name"])
 
 
 def delete_cluster(profile, **options):
+    provider = providers.get(profile["provider"])
     if profile["external"]:
         logging.debug("[%s] Skipping external cluster", profile["name"])
     else:
-        minikube.delete(profile["name"])
+        provider.delete(profile["name"])
 
     profile_config = drenv.config_dir(profile["name"])
     if os.path.exists(profile_config):
diff --git a/test/drenv/containerd.py b/test/drenv/containerd.py
index 2756cb77f..1815be6f0 100644
--- a/test/drenv/containerd.py
+++ b/test/drenv/containerd.py
@@ -6,17 +6,16 @@
 
 import toml
 
-from . import minikube
 from . import patch
 
 
-def configure(profile):
+def configure(provider, profile):
     config = f"{profile['name']}:/etc/containerd/config.toml"
 
     with tempfile.TemporaryDirectory() as tmpdir:
         tmp = os.path.join(tmpdir, "config.toml")
 
-        minikube.cp(profile["name"], config, tmp)
+        provider.cp(profile["name"], config, tmp)
 
         with open(tmp) as f:
             old_config = toml.load(f)
@@ -24,6 +23,6 @@ def configure(profile):
         with open(tmp, "w") as f:
             toml.dump(new_config, f)
 
-        minikube.cp(profile["name"], tmp, config)
+        provider.cp(profile["name"], tmp, config)
 
-        minikube.ssh(profile["name"], "sudo systemctl restart containerd")
+        provider.ssh(profile["name"], "sudo systemctl restart containerd")
diff --git a/test/drenv/envfile.py b/test/drenv/envfile.py
index 8902b20ab..5b1ac192c 100644
--- a/test/drenv/envfile.py
+++ b/test/drenv/envfile.py
@@ -8,12 +8,17 @@
 
 import yaml
 
+PROVIDER = "$provider"
 VM = "$vm"
 CONTAINER = "$container"
 SHARED_NETWORK = "$network"
 
 _PLATFORM_DEFAULTS = {
     "__default__": {
+        PROVIDER: {
+            "x86_64": "",
+            "arm64": "",
+        },
         VM: {
             "x86_64": "",
             "arm64": "",
@@ -25,6 +30,10 @@
         },
     },
     "linux": {
+        PROVIDER: {
+            "x86_64": "minikube",
+            "arm64": "",
+        },
         VM: {
             "x86_64": "kvm2",
             "arm64": "",
@@ -36,6 +45,10 @@
         },
     },
     "darwin": {
+        PROVIDER: {
+            "x86_64": "minikube",
+            "arm64": "minikube",
+        },
         VM: {
             "x86_64": "hyperkit",
             "arm64": "qemu",
@@ -50,8 +63,7 @@
 
 
 def platform_defaults():
-    # By default, use minikube defaults.
-
+    # By default, use provider defaults.
     operating_system = platform.system().lower()
     logging.debug("[envfile] Detected os: '%s'", operating_system)
     return _PLATFORM_DEFAULTS.get(operating_system, _PLATFORM_DEFAULTS["__default__"])
@@ -124,7 +136,8 @@ def _validate_profile(profile, addons_root):
     # If True, this is an external cluster and we don't have to start it.
     profile.setdefault("external", False)
 
-    # Properties for minikube created cluster.
+    # Properties for drenv managed cluster.
+    profile.setdefault("provider", PROVIDER)
     profile.setdefault("driver", VM)
     profile.setdefault("container_runtime", "")
     profile.setdefault("extra_disks", 0)
@@ -153,6 +166,9 @@ def _validate_platform_defaults(profile):
     machine = os.uname().machine
     logging.debug("[envfile] Detected machine: '%s'", machine)
 
+    if profile["provider"] == PROVIDER:
+        profile["provider"] = platform[PROVIDER][machine]
+
     if profile["driver"] == VM:
         profile["driver"] = platform[VM][machine]
     elif profile["driver"] == CONTAINER:
diff --git a/test/drenv/providers/__init__.py b/test/drenv/providers/__init__.py
new file mode 100644
index 000000000..7c63a2f59
--- /dev/null
+++ b/test/drenv/providers/__init__.py
@@ -0,0 +1,8 @@
+# SPDX-FileCopyrightText: The RamenDR authors
+# SPDX-License-Identifier: Apache-2.0
+
+import importlib
+
+
+def get(name):
+    return importlib.import_module("drenv.providers." + name)
diff --git a/test/drenv/minikube.py b/test/drenv/providers/minikube.py
similarity index 99%
rename from test/drenv/minikube.py
rename to test/drenv/providers/minikube.py
index 4c47f03fd..23e43be16 100644
--- a/test/drenv/minikube.py
+++ b/test/drenv/providers/minikube.py
@@ -9,7 +9,7 @@
 
 from packaging.version import Version
 
-from . import commands
+from drenv import commands
 
 EXTRA_CONFIG = [
     # When enabled, tells the Kubelet to pull images one at a time. This slows
diff --git a/test/setup.py b/test/setup.py
index ae122d6d7..4f9e8f243 100644
--- a/test/setup.py
+++ b/test/setup.py
@@ -17,7 +17,10 @@
     long_description=long_description,
     long_description_content_type="text/markdown",
     url="https://github.com/RamenDR/ramen",
-    packages=["drenv"],
+    packages=[
+        "drenv",
+        "drenv.providers",
+    ],
     install_requires=[
         "PyYAML",
         "toml",

From 921aa767a46fe3222b3c7cab9de0b0fb660b37ae Mon Sep 17 00:00:00 2001
From: Nir Soffer
Date: Tue, 27 Aug 2024 23:12:03 +0300
Subject: [PATCH 06/14] Make unused minikube functions private

We want to minimize the provider interface to make it easier to create
new providers.

Signed-off-by: Nir Soffer
---
 test/drenv/providers/minikube.py | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/test/drenv/providers/minikube.py b/test/drenv/providers/minikube.py
index 23e43be16..cb20fde7c 100644
--- a/test/drenv/providers/minikube.py
+++ b/test/drenv/providers/minikube.py
@@ -22,18 +22,6 @@
 ]
 
 
-def profile(command, output=None):
-    # Workaround for https://github.com/kubernetes/minikube/pull/16900
-    # TODO: remove when issue is fixed.
-    _create_profiles_dir()
-
-    return _run("profile", command, output=output)
-
-
-def status(name, output=None):
-    return _run("status", profile=name, output=output)
-
-
 def start(profile, verbose=False):
     start = time.monotonic()
     logging.info("[%s] Starting minikube cluster", profile["name"])
@@ -124,7 +112,7 @@ def ssh(name, command):
 
 
 def exists(name):
-    out = profile("list", output="json")
+    out = _profile("list", output="json")
     profiles = json.loads(out)
     for p in profiles["valid"]:
         if p["Name"] == name:
@@ -166,6 +154,18 @@ def cleanup_files():
     _cleanup_file(_sysctl_drenv_conf())
 
 
+def _profile(command, output=None):
+    # Workaround for https://github.com/kubernetes/minikube/pull/16900
+    # TODO: remove when issue is fixed.
+    _create_profiles_dir()
+
+    return _run("profile", command, output=output)
+
+
+def _status(name, output=None):
+    return _run("status", profile=name, output=output)
+
+
 def _version():
     """
     Get minikube version string ("v1.33.1") and return a package.version.Version

From d8f9b525e8e2d3bc708c948b4d56e10d5afc50cd Mon Sep 17 00:00:00 2001
From: Nir Soffer
Date: Tue, 27 Aug 2024 23:40:43 +0300
Subject: [PATCH 07/14] Move suspend and resume to minikube provider

These commands are not very portable; they work only on Linux when
using the minikube kvm2 driver.
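
After the move, __main__.py no longer runs virsh directly and instead
dispatches through the provider (a sketch based on the hunks below):

    for profile in env["profiles"]:
        provider = providers.get(profile["provider"])
        provider.suspend(profile)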
Signed-off-by: Nir Soffer
---
 test/drenv/__main__.py           |  6 ++++--
 test/drenv/providers/minikube.py | 20 ++++++++++++++++++++
 2 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/test/drenv/__main__.py b/test/drenv/__main__.py
index f33eac063..d4adfb659 100644
--- a/test/drenv/__main__.py
+++ b/test/drenv/__main__.py
@@ -305,14 +305,16 @@ def do_suspend(args):
     env = load_env(args)
     logging.info("[%s] Suspending environment", env["name"])
     for profile in env["profiles"]:
-        run("virsh", "-c", "qemu:///system", "suspend", profile["name"])
+        provider = providers.get(profile["provider"])
+        provider.suspend(profile)
 
 
 def do_resume(args):
     env = load_env(args)
     logging.info("[%s] Resuming environment", env["name"])
     for profile in env["profiles"]:
-        run("virsh", "-c", "qemu:///system", "resume", profile["name"])
+        provider = providers.get(profile["provider"])
+        provider.resume(profile)
 
 
 def do_dump(args):
diff --git a/test/drenv/providers/minikube.py b/test/drenv/providers/minikube.py
index cb20fde7c..64b3e1de6 100644
--- a/test/drenv/providers/minikube.py
+++ b/test/drenv/providers/minikube.py
@@ -103,6 +103,26 @@ def delete(name):
     logging.info("[%s] Cluster deleted in %.2f seconds", name, time.monotonic() - start)
 
 
+def suspend(profile):
+    if profile["driver"] != "kvm2":
+        logging.warning("[%s] suspend supported only for kvm2 driver", profile["name"])
+        return
+    logging.info("[%s] Suspending cluster", profile["name"])
+    cmd = ["virsh", "-c", "qemu:///system", "suspend", profile["name"]]
+    for line in commands.watch(*cmd):
+        logging.debug("[%s] %s", profile["name"], line)
+
+
+def resume(profile):
+    if profile["driver"] != "kvm2":
+        logging.warning("[%s] resume supported only for kvm2 driver", profile["name"])
+        return
+    logging.info("[%s] Resuming cluster", profile["name"])
+    cmd = ["virsh", "-c", "qemu:///system", "resume", profile["name"]]
+    for line in commands.watch(*cmd):
+        logging.debug("[%s] %s", profile["name"], line)
+
+
 def cp(name, src, dst):
     _watch("cp", src, dst, profile=name)
 

From 6d2d379c3db76dd27e7bf650799eb60091f8993e Mon Sep 17 00:00:00 2001
From: Nir Soffer
Date: Wed, 28 Aug 2024 00:10:10 +0300
Subject: [PATCH 08/14] Improve provider interface

All functions used in drenv/__main__.py now pass the profile dict
instead of the name. This is required for some functions like suspend
and resume, since these operations are available only with certain
profile drivers.

The load_files() function was renamed to configure(). The function also
accepts the profile so the provider can configure the cluster based on
the cluster configuration. This will be useful to configure containerd
later.

The setup_files() and cleanup_files() functions were renamed to setup()
and cleanup(). They do not accept a profile since they are called once
per provider.

Functions are grouped by type: provider scope, cluster scope, and
private helpers.
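
As a sketch, the provider interface after this patch looks like this
(bodies omitted; cp() and ssh() keep accepting a name since they are
used by helpers like containerd, not by __main__.py):

    # Provider scope - called once per provider.
    def setup(): ...
    def cleanup(): ...

    # Cluster scope - called with the profile dict.
    def exists(profile): ...
    def start(profile, verbose=False): ...
    def configure(profile): ...
    def stop(profile): ...
    def delete(profile): ...
    def suspend(profile): ...
    def resume(profile): ...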
Signed-off-by: Nir Soffer
---
 test/drenv/__main__.py           |  12 ++--
 test/drenv/providers/minikube.py | 119 ++++++++++++++++++-------------
 2 files changed, 74 insertions(+), 57 deletions(-)

diff --git a/test/drenv/__main__.py b/test/drenv/__main__.py
index d4adfb659..6cf5b81cd 100644
--- a/test/drenv/__main__.py
+++ b/test/drenv/__main__.py
@@ -187,7 +187,7 @@ def do_setup(args):
     for name in set(p["provider"] for p in env["profiles"]):
         logging.info("[main] Setting up '%s' for drenv", name)
         provider = providers.get(name)
-        provider.setup_files()
+        provider.setup()
 
 
 def do_cleanup(args):
@@ -195,7 +195,7 @@ def do_cleanup(args):
     for name in set(p["provider"] for p in env["profiles"]):
         logging.info("[main] Cleaning up '%s' for drenv", name)
         provider = providers.get(name)
-        provider.cleanup_files()
+        provider.cleanup()
 
 
 def do_clear(args):
@@ -363,7 +363,7 @@ def start_cluster(profile, hooks=(), args=None, **options):
     if profile["external"]:
         logging.debug("[%s] Skipping external cluster", profile["name"])
     else:
-        is_restart = provider.exists(profile["name"])
+        is_restart = provider.exists(profile)
         provider.start(profile, verbose=args.verbose)
         if profile["containerd"]:
             logging.info("[%s] Configuring containerd", profile["name"])
@@ -371,7 +371,7 @@ def start_cluster(profile, hooks=(), args=None, **options):
         if is_restart:
             restart_failed_deployments(profile)
         else:
-            provider.load_files(profile["name"])
+            provider.configure(profile)
 
     if hooks:
         execute(
@@ -400,7 +400,7 @@ def stop_cluster(profile, hooks=(), **options):
     if profile["external"]:
         logging.debug("[%s] Skipping external cluster", profile["name"])
     elif cluster_status != cluster.UNKNOWN:
-        provider.stop(profile["name"])
+        provider.stop(profile)
 
 
 def delete_cluster(profile, **options):
@@ -408,7 +408,7 @@ def delete_cluster(profile, **options):
     if profile["external"]:
         logging.debug("[%s] Skipping external cluster", profile["name"])
     else:
-        provider.delete(profile["name"])
+        provider.delete(profile)
 
     profile_config = drenv.config_dir(profile["name"])
     if os.path.exists(profile_config):
diff --git a/test/drenv/providers/minikube.py b/test/drenv/providers/minikube.py
index 64b3e1de6..83b9efd81 100644
--- a/test/drenv/providers/minikube.py
+++ b/test/drenv/providers/minikube.py
@@ -22,6 +22,43 @@
 ]
 
 
+# Provider scope
+
+
+def setup():
+    """
+    Set up minikube to work with drenv. Must be called before starting the
+    first cluster.
+
+    To load the configuration you must call configure() after a cluster is
+    started.
+    """
+    version = _version()
+    logging.debug("[minikube] Using minikube version %s", version)
+    _setup_sysctl(version)
+    _setup_systemd_resolved(version)
+
+
+def cleanup():
+    """
+    Cleanup files added by setup().
+    """
+    _cleanup_file(_systemd_resolved_drenv_conf())
+    _cleanup_file(_sysctl_drenv_conf())
+
+
+# Cluster scope
+
+
+def exists(profile):
+    out = _profile("list", output="json")
+    profiles = json.loads(out)
+    for p in profiles["valid"]:
+        if p["Name"] == profile["name"]:
+            return True
+    return False
+
+
 def start(profile, verbose=False):
     start = time.monotonic()
     logging.info("[%s] Starting minikube cluster", profile["name"])
@@ -89,18 +126,38 @@ def start(profile, verbose=False):
     )
 
 
-def stop(name):
+def configure(profile):
+    """
+    Load configuration done in setup() before the minikube cluster was
+    started.
+
+    Must be called after the cluster is started, before running any addon. Not
+    needed when starting a stopped cluster.
+ """ + _configure_sysctl(profile["name"]) + _configure_systemd_resolved(profile["name"]) + + +def stop(profile): start = time.monotonic() - logging.info("[%s] Stopping cluster", name) - _watch("stop", profile=name) - logging.info("[%s] Cluster stopped in %.2f seconds", name, time.monotonic() - start) + logging.info("[%s] Stopping cluster", profile["name"]) + _watch("stop", profile=profile["name"]) + logging.info( + "[%s] Cluster stopped in %.2f seconds", + profile["name"], + time.monotonic() - start, + ) -def delete(name): +def delete(profile): start = time.monotonic() - logging.info("[%s] Deleting cluster", name) - _watch("delete", profile=name) - logging.info("[%s] Cluster deleted in %.2f seconds", name, time.monotonic() - start) + logging.info("[%s] Deleting cluster", profile["name"]) + _watch("delete", profile=profile["name"]) + logging.info( + "[%s] Cluster deleted in %.2f seconds", + profile["name"], + time.monotonic() - start, + ) def suspend(profile): @@ -131,47 +188,7 @@ def ssh(name, command): _watch("ssh", command, profile=name) -def exists(name): - out = _profile("list", output="json") - profiles = json.loads(out) - for p in profiles["valid"]: - if p["Name"] == name: - return True - return False - - -def setup_files(): - """ - Set up minikube to work with drenv. Must be called before starting the - first cluster. - - To load the configuration you must call load_files() after a cluster is - created. - """ - version = _version() - logging.debug("[minikube] Using minikube version %s", version) - _setup_sysctl(version) - _setup_systemd_resolved(version) - - -def load_files(name): - """ - Load configuration done in setup_files() before the minikube cluster was - started. - - Must be called after the cluster is started, before running any addon. Not - need when starting a stopped cluster. - """ - _load_sysctl(name) - _load_systemd_resolved(name) - - -def cleanup_files(): - """ - Cleanup files added by setup_files(). - """ - _cleanup_file(_systemd_resolved_drenv_conf()) - _cleanup_file(_sysctl_drenv_conf()) +# Private helpers def _profile(command, output=None): @@ -217,7 +234,7 @@ def _setup_sysctl(version): _write_file(path, data) -def _load_sysctl(name): +def _configure_sysctl(name): if not os.path.exists(_sysctl_drenv_conf()): return logging.debug("[%s] Loading drenv sysctl configuration", name) @@ -250,7 +267,7 @@ def _setup_systemd_resolved(version): _write_file(path, data) -def _load_systemd_resolved(name): +def _configure_systemd_resolved(name): if not os.path.exists(_systemd_resolved_drenv_conf()): return logging.debug("[%s] Loading drenv systemd-resolved configuration", name) From 4bacac24d86df26a82b950b915dcf6c5130df9e1 Mon Sep 17 00:00:00 2001 From: Nir Soffer Date: Wed, 28 Aug 2024 00:39:07 +0300 Subject: [PATCH 09/14] Move containerd configuration to minikube.configure() This makes the start flow more generic, and allow every provider to do the right thing for the profile and cluster status. Signed-off-by: Nir Soffer --- test/drenv/__main__.py | 11 +++-------- test/drenv/providers/minikube.py | 15 ++++++++++----- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/test/drenv/__main__.py b/test/drenv/__main__.py index 6cf5b81cd..bda8a3b24 100644 --- a/test/drenv/__main__.py +++ b/test/drenv/__main__.py @@ -19,7 +19,6 @@ from . import cache from . import cluster from . import commands -from . import containerd from . import envfile from . import kubectl from . 
 from . import providers
@@ -363,15 +362,11 @@ def collect_addons(env):
 
 def start_cluster(profile, hooks=(), args=None, **options):
     provider = providers.get(profile["provider"])
     if profile["external"]:
         logging.debug("[%s] Skipping external cluster", profile["name"])
     else:
-        is_restart = provider.exists(profile)
+        existing = provider.exists(profile)
         provider.start(profile, verbose=args.verbose)
-        if profile["containerd"]:
-            logging.info("[%s] Configuring containerd", profile["name"])
-            containerd.configure(provider, profile)
-        if is_restart:
+        provider.configure(profile, existing=existing)
+        if existing:
             restart_failed_deployments(profile)
-        else:
-            provider.configure(profile)
 
     if hooks:
         execute(
diff --git a/test/drenv/providers/minikube.py b/test/drenv/providers/minikube.py
index 83b9efd81..b17f0e71e 100644
--- a/test/drenv/providers/minikube.py
+++ b/test/drenv/providers/minikube.py
@@ -5,11 +5,13 @@
 import json
 import logging
 import os
+import sys
 import time
 
 from packaging.version import Version
 
 from drenv import commands
+from drenv import containerd
 
 EXTRA_CONFIG = [
     # When enabled, tells the Kubelet to pull images one at a time. This slows
@@ -126,16 +128,19 @@ def start(profile, verbose=False):
     )
 
 
-def configure(profile):
+def configure(profile, existing=False):
     """
     Load configuration done in setup() before the minikube cluster was
     started.
 
-    Must be called after the cluster is started, before running any addon. Not
-    needed when starting a stopped cluster.
+    Must be called after the cluster is started, before running any addon.
     """
-    _configure_sysctl(profile["name"])
-    _configure_systemd_resolved(profile["name"])
+    if not existing:
+        if profile["containerd"]:
+            logging.info("[%s] Configuring containerd", profile["name"])
+            containerd.configure(sys.modules[__name__], profile)
+        _configure_sysctl(profile["name"])
+        _configure_systemd_resolved(profile["name"])

From b9df788998f9aa6fa9b2141373488c7e9dfed77e Mon Sep 17 00:00:00 2001
From: Nir Soffer
Date: Wed, 28 Aug 2024 20:35:58 +0300
Subject: [PATCH 10/14] Move waiting for fresh status to minikube

This is a specific minikube workaround - when starting an existing
cluster, kubernetes reports stale state for a while. The wait is not
needed for external clusters, which we never restart. If this is needed
for other providers we can extract a common helper later.

Signed-off-by: Nir Soffer
---
 test/drenv/__main__.py           | 14 ++++----------
 test/drenv/providers/minikube.py | 16 ++++++++++++++++
 2 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/test/drenv/__main__.py b/test/drenv/__main__.py
index bda8a3b24..9da2de492 100644
--- a/test/drenv/__main__.py
+++ b/test/drenv/__main__.py
@@ -411,18 +411,12 @@ def delete_cluster(profile, **options):
         shutil.rmtree(profile_config)
 
 
-def restart_failed_deployments(profile, initial_wait=30):
+def restart_failed_deployments(profile):
     """
-    When restarting, kubectl can report stale status for a while, before it
-    starts to report real status. Then it takes a while until all deployments
-    become available.
-
-    We first wait for initial_wait seconds to give Kubernetes chance to fail
-    liveness and readiness checks. Then we restart for failed deployments.
+    When restarting after failure, some deployments may enter a failing state.
+    This is not handled by the addons. Restarting the deployment solves this
+    issue. This may also be solved at the addon level.
""" - logging.info("[%s] Waiting for fresh status", profile["name"]) - time.sleep(initial_wait) - logging.info("[%s] Looking up failed deployments", profile["name"]) debug = partial(logging.debug, f"[{profile['name']}] %s") diff --git a/test/drenv/providers/minikube.py b/test/drenv/providers/minikube.py index b17f0e71e..510f3af93 100644 --- a/test/drenv/providers/minikube.py +++ b/test/drenv/providers/minikube.py @@ -142,6 +142,9 @@ def configure(profile, existing=False): _configure_sysctl(profile["name"]) _configure_systemd_resolved(profile["name"]) + if existing: + _wait_for_fresh_status(profile) + def stop(profile): start = time.monotonic() @@ -196,6 +199,19 @@ def ssh(name, command): # Private helpers +def _wait_for_fresh_status(profile): + """ + When starting an existing cluster, kubectl can report stale status for a + while, before it starts to report real status. Then it takes a while until + all deployments become available. + + We wait 30 seconds to give Kubernetes chance to fail liveness and readiness + checks and start reporting real cluster status. + """ + logging.info("[%s] Waiting for fresh status", profile["name"]) + time.sleep(30) + + def _profile(command, output=None): # Workaround for https://github.com/kubernetes/minikube/pull/16900 # TODO: remove when issue is fixed. From 2cbca308947869ef4f842c5eb6c5f9dfa2f419c5 Mon Sep 17 00:00:00 2001 From: Nir Soffer Date: Thu, 29 Aug 2024 17:34:34 +0300 Subject: [PATCH 11/14] Support watching stderr Some commands (like drenv) log to stderr without writing anything to stdout. When we watch such commands we want to watch stderr instead of stdout. Since the command do not write anything to stdout, we can redirect the command stderr to stdout. When a command fails, we cannot report the error message since it was already yielded to the code watching the command. This is the issue with logging everything to stderr, but we don't control the commands we run. This change add an option to redirect stderr to commands.watch() and and test the behavior. Signed-off-by: Nir Soffer --- test/drenv/commands.py | 17 +++++++++++++++-- test/drenv/commands_test.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 2 deletions(-) diff --git a/test/drenv/commands.py b/test/drenv/commands.py index af35538ba..2697e110a 100644 --- a/test/drenv/commands.py +++ b/test/drenv/commands.py @@ -108,10 +108,23 @@ def run(*args, input=None, decode=True, env=None): return output.decode() if decode else output -def watch(*args, input=None, keepends=False, decode=True, timeout=None, env=None): +def watch( + *args, + input=None, + keepends=False, + decode=True, + timeout=None, + env=None, + stderr=subprocess.PIPE, +): """ Run command args, iterating over lines read from the child process stdout. + Some commands have no output and log everyting to stderr (like drenv). To + watch the output call with stderr=subprocess.STDOUT. When such command + fails, we have always have empty error, since the content was already + yielded to the caller. + Assumes that the child process output UTF-8. Will raise if the command outputs binary data. This is not a problem in this projects since all our commands are text based. @@ -144,7 +157,7 @@ def watch(*args, input=None, keepends=False, decode=True, timeout=None, env=None # Avoid blocking foerver if there is no input. 
             stdin=subprocess.PIPE if input else subprocess.DEVNULL,
             stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
+            stderr=stderr,
             env=env,
         )
     except OSError as e:
diff --git a/test/drenv/commands_test.py b/test/drenv/commands_test.py
index 9b9bcada6..88b419cbc 100644
--- a/test/drenv/commands_test.py
+++ b/test/drenv/commands_test.py
@@ -170,6 +170,35 @@ def test_watch_lines():
     assert output == ["line %d" % i for i in range(10)]
 
 
+def test_watch_stderr_success():
+    # Watching command like drenv, logging only to stderr without any output.
+    script = r"""
+import sys
+
+for i in range(10):
+    sys.stderr.write(f"line {i}\n")
+"""
+    cmd = ["python3", "-c", script]
+    output = list(commands.watch(*cmd, stderr=subprocess.STDOUT))
+    assert output == [f"line {i}" for i in range(10)]
+
+
+def test_watch_stderr_error():
+    # When stderr is redirected to stdout the error is empty.
+    script = r"""
+import sys
+
+sys.stderr.write("before error\n")
+sys.exit("error")
+"""
+    cmd = ["python3", "-c", script]
+    output = []
+    with pytest.raises(commands.Error) as e:
+        for line in commands.watch(*cmd, stderr=subprocess.STDOUT):
+            output.append(line)
+
+    assert output == ["before error", "error"]
+    assert e.value.error == ""
+
+
 def test_watch_partial_lines():
     script = """
 import time

From 9afeed70a9ff5e4a9940f5d299e56e0adff4dcb5 Mon Sep 17 00:00:00 2001
From: Nir Soffer
Date: Wed, 4 Sep 2024 20:34:47 +0300
Subject: [PATCH 12/14] Log drenv output in tests

Like limactl, drenv logs only to stderr, so when running it in tests
with:

    commands.run("drenv", "start", ...)

we don't see anything in the test logs. If the test is blocked for a
long time, we have no way to debug this. The helpful log lines are
buffered in the command error buffer.

Use the new stderr= argument to watch and log the command output, and
add helpers for drenv in a consistent way.

Signed-off-by: Nir Soffer
---
 test/drenv/drenv_test.py | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/test/drenv/drenv_test.py b/test/drenv/drenv_test.py
index 9454c323f..dd3c1203b 100644
--- a/test/drenv/drenv_test.py
+++ b/test/drenv/drenv_test.py
@@ -2,7 +2,9 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import json
+import logging
 import os
+import subprocess
 
 import yaml
 import pytest
@@ -19,23 +21,23 @@
 def test_start_unknown():
     # Cluster does not exists, so it should fail.
     with pytest.raises(commands.Error):
-        commands.run("drenv", "start", "--name-prefix", "unknown-", EXTERNAL_ENV)
+        watch("drenv", "start", "--name-prefix", "unknown-", EXTERNAL_ENV, "--verbose")
 
 
 def test_start(tmpenv):
-    commands.run("drenv", "start", "--name-prefix", tmpenv.prefix, EXTERNAL_ENV)
+    watch("drenv", "start", "--name-prefix", tmpenv.prefix, EXTERNAL_ENV, "--verbose")
     assert cluster.status(tmpenv.prefix + "cluster") == cluster.READY
 
 
 def test_dump_without_prefix():
-    out = commands.run("drenv", "dump", EXAMPLE_ENV)
+    out = run("drenv", "dump", EXAMPLE_ENV)
     dump = yaml.safe_load(out)
     assert dump["profiles"][0]["name"] == "ex1"
     assert dump["profiles"][1]["name"] == "ex2"
 
 
 def test_dump_with_prefix():
-    out = commands.run("drenv", "dump", "--name-prefix", "test-", EXAMPLE_ENV)
+    out = run("drenv", "dump", "--name-prefix", "test-", EXAMPLE_ENV)
     dump = yaml.safe_load(out)
     assert dump["profiles"][0]["name"] == "test-ex1"
     assert dump["profiles"][1]["name"] == "test-ex2"
 
 
 def test_stop_unknown():
     # Does nothing, so should succeed.
- commands.run("drenv", "stop", "--name-prefix", "unknown-", EXTERNAL_ENV) + run("drenv", "stop", "--name-prefix", "unknown-", EXTERNAL_ENV) def test_stop(tmpenv): # Stop does nothing, so cluster must be ready. - commands.run("drenv", "stop", "--name-prefix", tmpenv.prefix, EXTERNAL_ENV) + run("drenv", "stop", "--name-prefix", tmpenv.prefix, EXTERNAL_ENV) assert cluster.status(tmpenv.prefix + "cluster") == cluster.READY def test_delete_unknown(): # Does nothing, so should succeed. - commands.run("drenv", "delete", "--name-prefix", "unknown-", EXTERNAL_ENV) + run("drenv", "delete", "--name-prefix", "unknown-", EXTERNAL_ENV) def test_delete(tmpenv): # Delete does nothing, so cluster must be ready. - commands.run("drenv", "delete", "--name-prefix", tmpenv.prefix, EXTERNAL_ENV) + run("drenv", "delete", "--name-prefix", tmpenv.prefix, EXTERNAL_ENV) assert cluster.status(tmpenv.prefix + "cluster") == cluster.READY @@ -76,7 +78,7 @@ def test_missing_addon(tmpdir): path = tmpdir.join("missing-addon.yaml") path.write(content) with pytest.raises(commands.Error): - commands.run("drenv", "start", str(path)) + run("drenv", "start", str(path)) def test_kustomization(tmpdir): @@ -153,3 +155,12 @@ def get_config(context=None, kubeconfig=None): args.append(f"--kubeconfig={kubeconfig}") out = kubectl.config(*args, context=context) return json.loads(out) + + +def run(*args): + return commands.run(*args) + + +def watch(*args): + for line in commands.watch(*args, stderr=subprocess.STDOUT): + logging.debug("%s", line) From 2d7459c7c52f732ff424cc7fb3537cfbb7b3f65b Mon Sep 17 00:00:00 2001 From: Nir Soffer Date: Thu, 5 Sep 2024 18:11:36 +0300 Subject: [PATCH 13/14] Use /readyz endpoint for ready check We used `kubectl version` as a proxy for cluster readynes, checking for server info in the response. Replace the check with lower level check if API server /readyz endpoint. Signed-off-by: Nir Soffer --- test/drenv/cluster.py | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/test/drenv/cluster.py b/test/drenv/cluster.py index 4afe546bc..aca12b3c3 100644 --- a/test/drenv/cluster.py +++ b/test/drenv/cluster.py @@ -5,6 +5,7 @@ import time from . import kubectl +from . import commands # Cluster does not have kubeconfig. UNKNOWN = "unknwon" @@ -12,7 +13,7 @@ # Cluster has kubeconfig. CONFIGURED = "configured" -# APIServer is responding. +# APIServer is ready. READY = "ready" @@ -20,21 +21,22 @@ def status(name): if not kubeconfig(name): return UNKNOWN - out = kubectl.version(context=name, output="json") - version_info = json.loads(out) - if "serverVersion" not in version_info: + try: + readyz(name) + except commands.Error: return CONFIGURED return READY -def wait_until_ready(name, timeout=600): +def wait_until_ready(name, timeout=600, log=print): """ Wait until a cluster is ready. This is useful when starting profiles concurrently, when one profile needs - to wait for another profile. + to wait for another profile, or when restarting a stopped cluster. 
""" + log(f"Waiting until cluster '{name}' is ready") deadline = time.monotonic() + timeout delay = min(1.0, timeout / 60) last_status = None @@ -43,7 +45,7 @@ def wait_until_ready(name, timeout=600): current_status = status(name) if current_status != last_status: - print(f"Cluster '{name}' is {current_status}") + log(f"Cluster '{name}' is {current_status}") last_status = current_status if current_status == READY: @@ -77,3 +79,14 @@ def kubeconfig(context_name): return cluster return {} + + +def readyz(name, verbose=False): + """ + Check if API server is ready. + https://kubernetes.io/docs/reference/using-api/health-checks/ + """ + path = "/readyz" + if verbose: + path += "?verbose" + return kubectl.get("--raw", path, context=name) From eaba42ccf0715ad0dacf830d16a1c35d017bd6af Mon Sep 17 00:00:00 2001 From: Nir Soffer Date: Wed, 28 Aug 2024 15:59:19 +0300 Subject: [PATCH 14/14] Add external provider Before this change we had only one provider, so this was internal implementation detail. This change adds the second provider, allowing users to configure the provider in the environment file. Replace the `external: true` option with `provider: external`. With this we can remove the special handling or external cluster with calls to the external provider which does the right thing. The external provider basically does nothing, since we do not manage this cluster. However in start() we ensure that the cluster exists and then wait until the cluster is ready. This helps to debug issues with external cluster and reduces log noise. Signed-off-by: Nir Soffer --- test/README.md | 8 ++-- test/drenv/__main__.py | 26 +++++------- test/drenv/providers/external.py | 73 ++++++++++++++++++++++++++++++++ test/envs/external.yaml | 8 ++-- 4 files changed, 92 insertions(+), 23 deletions(-) create mode 100644 test/drenv/providers/external.py diff --git a/test/README.md b/test/README.md index 6114581b0..3ecfca624 100644 --- a/test/README.md +++ b/test/README.md @@ -539,9 +539,11 @@ $ drenv delete envs/example.yaml - `templates`: templates for creating new profiles. - `name`: profile name. - - `external`: true if this is existing external cluster. In this - case the tool will not start a minikube cluster and all other - options are ignored. + - `provider`: cluster provider. The default provider is "minikube", + creating cluster using VM or containers. Use "external" to use + exsiting clusters not managed by `drenv`. Use the special value + "$provider" to select the best provider for the host. (default + "$provider") - `driver`: The minikube driver. On Linux, the default drivers are kvm2 and docker for VMs and containers. On MacOS, the defaults are hyperkit and podman. 
Use "$vm" and "$container" values to use the recommended VM and diff --git a/test/drenv/__main__.py b/test/drenv/__main__.py index 9da2de492..69681e6c3 100644 --- a/test/drenv/__main__.py +++ b/test/drenv/__main__.py @@ -359,14 +359,13 @@ def collect_addons(env): def start_cluster(profile, hooks=(), args=None, **options): provider = providers.get(profile["provider"]) - if profile["external"]: - logging.debug("[%s] Skipping external cluster", profile["name"]) - else: - existing = provider.exists(profile) - provider.start(profile, verbose=args.verbose) - provider.configure(profile, existing=existing) - if existing: - restart_failed_deployments(profile) + existing = provider.exists(profile) + + provider.start(profile, verbose=args.verbose) + provider.configure(profile, existing=existing) + + if existing: + restart_failed_deployments(profile) if hooks: execute( @@ -391,19 +390,14 @@ def stop_cluster(profile, hooks=(), **options): allow_failure=True, ) - provider = providers.get(profile["provider"]) - if profile["external"]: - logging.debug("[%s] Skipping external cluster", profile["name"]) - elif cluster_status != cluster.UNKNOWN: + if cluster_status != cluster.UNKNOWN: + provider = providers.get(profile["provider"]) provider.stop(profile) def delete_cluster(profile, **options): provider = providers.get(profile["provider"]) - if profile["external"]: - logging.debug("[%s] Skipping external cluster", profile["name"]) - else: - provider.delete(profile) + provider.delete(profile) profile_config = drenv.config_dir(profile["name"]) if os.path.exists(profile_config): diff --git a/test/drenv/providers/external.py b/test/drenv/providers/external.py new file mode 100644 index 000000000..a528e3546 --- /dev/null +++ b/test/drenv/providers/external.py @@ -0,0 +1,73 @@ +# SPDX-FileCopyrightText: The RamenDR authors +# SPDX-License-Identifier: Apache-2.0 + +import logging +import time +from functools import partial + +from drenv import cluster + +# Provider scope + + +def setup(): + logging.info("[external] Skipping setup for external provider") + + +def cleanup(): + logging.info("[external] Skipping cleanup for external provider") + + +# Cluster scope + + +def exists(profile): + return True + + +def start(profile, verbose=False): + start = time.monotonic() + logging.info("[%s] Checking external cluster status", profile["name"]) + + # Fail fast if cluster is not configured, we cannot recover from this. + status = cluster.status(profile["name"]) + if status == cluster.UNKNOWN: + raise RuntimeError(f"Cluster '{profile['name']}' does not exist") + + # Otherwise handle temporary outage gracefuly. 
+    debug = partial(logging.debug, f"[{profile['name']}] %s")
+    cluster.wait_until_ready(profile["name"], timeout=60, log=debug)
+
+    logging.info(
+        "[%s] Cluster ready in %.2f seconds",
+        profile["name"],
+        time.monotonic() - start,
+    )
+
+
+def configure(profile, existing=False):
+    logging.info("[%s] Skipping configure for external cluster", profile["name"])
+
+
+def stop(profile):
+    logging.info("[%s] Skipping stop for external cluster", profile["name"])
+
+
+def delete(profile):
+    logging.info("[%s] Skipping delete for external cluster", profile["name"])
+
+
+def suspend(profile):
+    logging.info("[%s] Skipping suspend for external cluster", profile["name"])
+
+
+def resume(profile):
+    logging.info("[%s] Skipping resume for external cluster", profile["name"])
+
+
+def cp(name, src, dst):
+    logging.warning("[%s] cp not implemented yet for external cluster", name)
+
+
+def ssh(name, command):
+    logging.warning("[%s] ssh not implemented yet for external cluster", name)
diff --git a/test/envs/external.yaml b/test/envs/external.yaml
index 51da25131..98238dd99 100644
--- a/test/envs/external.yaml
+++ b/test/envs/external.yaml
@@ -1,12 +1,12 @@
 # SPDX-FileCopyrightText: The RamenDR authors
 # SPDX-License-Identifier: Apache-2.0
 
-# Example environment using external clusters. The cluster `test` must exist
-# when this environment is started.
+# Example environment using external clusters. The cluster must exist when this
+# environment is started.
 #
 # To try this example, create the cluster with:
 #
-#     drenv start envs/test.yaml
+#     drenv start envs/vm.yaml
 #
 # Now you can start this environment with:
 #
@@ -20,7 +20,7 @@ name: external
 profiles:
   - name: cluster
-    external: true
+    provider: external
     workers:
       - addons:
           - name: example
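
Taken together, the series lets an environment be driven end to end
through the provider module named in the profile. A minimal sketch of
the flow, using the external provider added above (the profile dict
here is illustrative, not a real env file):

    from drenv import providers

    profile = {"name": "cluster", "provider": "external"}
    provider = providers.get(profile["provider"])

    provider.setup()                       # no-op for the external provider
    provider.start(profile, verbose=True)  # fails fast if the cluster is missing,
                                           # then waits until /readyz reports ready
    provider.configure(profile, existing=provider.exists(profile))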