Introduce drenv providers #1534

Merged
merged 14 commits on Sep 18, 2024
4 changes: 2 additions & 2 deletions .github/workflows/e2e.yaml
@@ -29,7 +29,7 @@ jobs:

      - name: Setup drenv
        working-directory: test
        run: drenv setup -v
        run: drenv setup -v envs/regional-dr.yaml

      - name: Install ramenctl
        run: pip install -e ramenctl
@@ -100,4 +100,4 @@ jobs:
      - name: Cleanup drenv
        if: always()
        working-directory: test
        run: drenv cleanup -v
        run: drenv cleanup -v envs/regional-dr.yaml
3 changes: 0 additions & 3 deletions hack/make-venv
@@ -27,9 +27,6 @@ cp coverage.pth $venv/lib/python*/site-packages
echo "Adding venv symlink..."
ln -sf $venv/bin/activate venv

echo "Setting up minikube for drenv"
$venv/bin/drenv setup -v

echo
echo "To activate the environment run:"
echo
2 changes: 1 addition & 1 deletion test/Makefile
@@ -50,7 +50,7 @@ coverage-html:
	xdg-open htmlcov/index.html

cluster:
	drenv start --name-prefix $(prefix) $(env)
	drenv start --name-prefix $(prefix) $(env) -v

clean:
	drenv delete --name-prefix $(prefix) $(env)
8 changes: 5 additions & 3 deletions test/README.md
@@ -539,9 +539,11 @@ $ drenv delete envs/example.yaml

- `templates`: templates for creating new profiles.
- `name`: profile name.
- `external`: true if this is existing external cluster. In this
  case the tool will not start a minikube cluster and all other
  options are ignored.
- `provider`: cluster provider. The default provider is "minikube",
  creating clusters using VMs or containers. Use "external" to use
  existing clusters not managed by `drenv`. Use the special value
  "$provider" to select the best provider for the host. (default
  "$provider")
- `driver`: The minikube driver. On Linux, the default drivers are kvm2 and
  docker for VMs and containers. On MacOS, the defaults are hyperkit and
  podman. Use "$vm" and "$container" values to use the recommended VM and
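The special "$provider" value defers provider selection to drenv. As a rough illustration only, resolution could look like the sketch below; the real selection logic lives in the new providers package and is not shown in this diff, and the function and platform mapping here are assumptions:

```python
import platform


def resolve_provider(name):
    """
    Illustrative only: map the special "$provider" value to a concrete
    provider name. Not the actual drenv implementation.
    """
    if name != "$provider":
        return name  # explicit provider, e.g. "minikube" or "external"
    # Assumption: minikube is the recommended provider on Linux and macOS.
    if platform.system() in ("Linux", "Darwin"):
        return "minikube"
    raise ValueError(f"no recommended provider for {platform.system()}")
```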
134 changes: 34 additions & 100 deletions test/drenv/__main__.py
@@ -19,10 +19,9 @@
from . import cache
from . import cluster
from . import commands
from . import containerd
from . import envfile
from . import kubectl
from . import minikube
from . import providers
from . import ramen
from . import shutdown

@@ -114,8 +113,8 @@ def parse_args():
    add_command(sp, "dump", do_dump, help="dump an environment yaml")

    add_command(sp, "clear", do_clear, help="clear cached resources", envfile=False)
    add_command(sp, "setup", do_setup, help="setup minikube for drenv", envfile=False)
    add_command(sp, "cleanup", do_cleanup, help="cleanup minikube", envfile=False)
    add_command(sp, "setup", do_setup, help="setup host for drenv")
    add_command(sp, "cleanup", do_cleanup, help="cleanup host")

    return parser.parse_args()

@@ -183,13 +182,19 @@ def handle_termination_signal(signo, frame):


def do_setup(args):
    logging.info("[main] Setting up minikube for drenv")
    minikube.setup_files()
    env = load_env(args)
    for name in set(p["provider"] for p in env["profiles"]):
        logging.info("[main] Setting up '%s' for drenv", name)
        provider = providers.get(name)
        provider.setup()


def do_cleanup(args):
    logging.info("[main] Cleaning up minikube")
    minikube.cleanup_files()
    env = load_env(args)
    for name in set(p["provider"] for p in env["profiles"]):
        logging.info("[main] Cleaning up '%s' for drenv", name)
        provider = providers.get(name)
        provider.cleanup()


def do_clear(args):
@@ -299,14 +304,16 @@ def do_suspend(args):
    env = load_env(args)
    logging.info("[%s] Suspending environment", env["name"])
    for profile in env["profiles"]:
        run("virsh", "-c", "qemu:///system", "suspend", profile["name"])
        provider = providers.get(profile["provider"])
        provider.suspend(profile)


def do_resume(args):
    env = load_env(args)
    logging.info("[%s] Resuming environment", env["name"])
    for profile in env["profiles"]:
        run("virsh", "-c", "qemu:///system", "resume", profile["name"])
        provider = providers.get(profile["provider"])
        provider.resume(profile)


def do_dump(args):
@@ -351,18 +358,14 @@ def collect_addons(env):


def start_cluster(profile, hooks=(), args=None, **options):
    if profile["external"]:
        logging.debug("[%s] Skipping external cluster", profile["name"])
    else:
        is_restart = minikube_profile_exists(profile["name"])
        start_minikube_cluster(profile, verbose=args.verbose)
        if profile["containerd"]:
            logging.info("[%s] Configuring containerd", profile["name"])
            containerd.configure(profile)
        if is_restart:
            restart_failed_deployments(profile)
        else:
            minikube.load_files(profile["name"])
    provider = providers.get(profile["provider"])
    existing = provider.exists(profile)

    provider.start(profile, verbose=args.verbose)
    provider.configure(profile, existing=existing)

    if existing:
        restart_failed_deployments(profile)

    if hooks:
        execute(
@@ -387,96 +390,27 @@ def stop_cluster(profile, hooks=(), **options):
            allow_failure=True,
        )

    if profile["external"]:
        logging.debug("[%s] Skipping external cluster", profile["name"])
    elif cluster_status != cluster.UNKNOWN:
        stop_minikube_cluster(profile)
    if cluster_status != cluster.UNKNOWN:
        provider = providers.get(profile["provider"])
        provider.stop(profile)


def delete_cluster(profile, **options):
    if profile["external"]:
        logging.debug("[%s] Skipping external cluster", profile["name"])
    else:
        delete_minikube_cluster(profile)
    provider = providers.get(profile["provider"])
    provider.delete(profile)

    profile_config = drenv.config_dir(profile["name"])
    if os.path.exists(profile_config):
        logging.info("[%s] Removing config %s", profile["name"], profile_config)
        shutil.rmtree(profile_config)


def minikube_profile_exists(name):
    out = minikube.profile("list", output="json")
    profiles = json.loads(out)
    for profile in profiles["valid"]:
        if profile["Name"] == name:
            return True
    return False


def start_minikube_cluster(profile, verbose=False):
    start = time.monotonic()
    logging.info("[%s] Starting minikube cluster", profile["name"])

    minikube.start(
        profile["name"],
        driver=profile["driver"],
        container_runtime=profile["container_runtime"],
        extra_disks=profile["extra_disks"],
        disk_size=profile["disk_size"],
        network=profile["network"],
        nodes=profile["nodes"],
        cni=profile["cni"],
        cpus=profile["cpus"],
        memory=profile["memory"],
        addons=profile["addons"],
        service_cluster_ip_range=profile["service_cluster_ip_range"],
        extra_config=profile["extra_config"],
        feature_gates=profile["feature_gates"],
        alsologtostderr=verbose,
    )

    logging.info(
        "[%s] Cluster started in %.2f seconds",
        profile["name"],
        time.monotonic() - start,
    )


def stop_minikube_cluster(profile):
    start = time.monotonic()
    logging.info("[%s] Stopping cluster", profile["name"])
    minikube.stop(profile["name"])
    logging.info(
        "[%s] Cluster stopped in %.2f seconds",
        profile["name"],
        time.monotonic() - start,
    )


def delete_minikube_cluster(profile):
    start = time.monotonic()
    logging.info("[%s] Deleting cluster", profile["name"])
    minikube.delete(profile["name"])
    logging.info(
        "[%s] Cluster deleted in %.2f seconds",
        profile["name"],
        time.monotonic() - start,
    )


def restart_failed_deployments(profile, initial_wait=30):
def restart_failed_deployments(profile):
    """
    When restarting, kubectl can report stale status for a while, before it
    starts to report real status. Then it takes a while until all deployments
    become available.

    We first wait for initial_wait seconds to give Kubernetes chance to fail
    liveness and readiness checks. Then we restart for failed deployments.
    When restarting after a failure, some deployments may enter a failing
    state. This is not handled by the addons. Restarting the failed
    deployments solves the issue. This may also be solved at the addon level.
    """
    logging.info("[%s] Waiting for fresh status", profile["name"])
    time.sleep(initial_wait)

    logging.info("[%s] Looking up failed deployments", profile["name"])
    debug = partial(logging.debug, f"[{profile['name']}] %s")

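For reference, the call sites above use `providers.get(name)` and then call `setup`, `cleanup`, `exists`, `start`, `configure`, `stop`, `delete`, `suspend`, and `resume` on the result. That implies roughly the per-provider interface sketched below; only the names are taken from the diff, the docstrings and signatures beyond that are assumptions and this is not the actual `providers` package:

```python
# Sketch of a provider module as implied by the call sites in __main__.py.


def setup():
    """Prepare the host for this provider (tools, config files)."""


def cleanup():
    """Undo whatever setup() created on the host."""


def exists(profile):
    """Return True if a cluster for this profile already exists."""


def start(profile, verbose=False):
    """Create the cluster, or restart it if it already exists."""


def configure(profile, existing=False):
    """Apply post-start configuration to the cluster."""


def stop(profile):
    """Stop the cluster without deleting it."""


def delete(profile):
    """Delete the cluster and its resources."""


def suspend(profile):
    """Suspend the cluster (for example, pause its VMs)."""


def resume(profile):
    """Resume a suspended cluster."""
```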
27 changes: 20 additions & 7 deletions test/drenv/cluster.py
@@ -5,36 +5,38 @@
import time

from . import kubectl
from . import commands

# Cluster does not have kubeconfig.
UNKNOWN = "unknwon"

# Cluster has kubeconfig.
CONFIGURED = "configured"

# APIServer is responding.
# APIServer is ready.
READY = "ready"


def status(name):
    if not kubeconfig(name):
        return UNKNOWN

    out = kubectl.version(context=name, output="json")
    version_info = json.loads(out)
    if "serverVersion" not in version_info:
    try:
        readyz(name)
    except commands.Error:
        return CONFIGURED

    return READY


def wait_until_ready(name, timeout=600):
def wait_until_ready(name, timeout=600, log=print):
    """
    Wait until a cluster is ready.

    This is useful when starting profiles concurrently, when one profile needs
    to wait for another profile.
    to wait for another profile, or when restarting a stopped cluster.
    """
    log(f"Waiting until cluster '{name}' is ready")
    deadline = time.monotonic() + timeout
    delay = min(1.0, timeout / 60)
    last_status = None
@@ -43,7 +45,7 @@ def wait_until_ready(name, timeout=600):
        current_status = status(name)

        if current_status != last_status:
            print(f"Cluster '{name}' is {current_status}")
            log(f"Cluster '{name}' is {current_status}")
            last_status = current_status

        if current_status == READY:
@@ -77,3 +79,14 @@ def kubeconfig(context_name):
            return cluster

    return {}


def readyz(name, verbose=False):
    """
    Check if API server is ready.
    https://kubernetes.io/docs/reference/using-api/health-checks/
    """
    path = "/readyz"
    if verbose:
        path += "?verbose"
    return kubectl.get("--raw", path, context=name)
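A minimal usage sketch for the new `log` parameter and the `readyz()` helper; it assumes the package is importable as `drenv` and that a kubeconfig context named "dr1" exists:

```python
import logging

from drenv import cluster

logging.basicConfig(level=logging.INFO)

# Route progress messages to the logger instead of the default print().
cluster.wait_until_ready("dr1", timeout=300, log=logging.info)

# readyz() returns the API server response and raises commands.Error
# while the server is not ready yet.
print(cluster.readyz("dr1", verbose=True))
```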
17 changes: 15 additions & 2 deletions test/drenv/commands.py
@@ -108,10 +108,23 @@ def run(*args, input=None, decode=True, env=None):
    return output.decode() if decode else output


def watch(*args, input=None, keepends=False, decode=True, timeout=None, env=None):
def watch(
    *args,
    input=None,
    keepends=False,
    decode=True,
    timeout=None,
    env=None,
    stderr=subprocess.PIPE,
):
    """
    Run command args, iterating over lines read from the child process stdout.

    Some commands have no output and log everything to stderr (like drenv). To
    watch the output, call with stderr=subprocess.STDOUT. When such a command
    fails, the error is always empty, since the content was already yielded to
    the caller.

    Assumes that the child process outputs UTF-8. Will raise if the command
    outputs binary data. This is not a problem in this project since all our
    commands are text based.
@@ -144,7 +157,7 @@ def watch(*args, input=None, keepends=False, decode=True, timeout=None, env=None
            # Avoid blocking forever if there is no input.
            stdin=subprocess.PIPE if input else subprocess.DEVNULL,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            stderr=stderr,
            env=env,
        )
    except OSError as e:
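A short usage sketch for the new `stderr` parameter, following the docstring above; the `drenv` command line and env file path are examples only:

```python
import subprocess

from drenv import commands

# drenv writes its progress to stderr and nothing to stdout, so merge
# stderr into the watched stream to see the progress line by line.
for line in commands.watch(
    "drenv", "start", "envs/example.yaml", stderr=subprocess.STDOUT
):
    print(line)
```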
29 changes: 29 additions & 0 deletions test/drenv/commands_test.py
@@ -170,6 +170,35 @@ def test_watch_lines():
    assert output == ["line %d" % i for i in range(10)]


def test_watch_stderr_success():
    # Watching a command like drenv, which logs only to stderr and writes
    # nothing to stdout.
    script = r"""
import sys
for i in range(10):
    sys.stderr.write(f"line {i}\n")
"""
    cmd = ["python3", "-c", script]
    output = list(commands.watch(*cmd, stderr=subprocess.STDOUT))
    assert output == [f"line {i}" for i in range(10)]


def test_watch_stderr_error():
    # When stderr is redirected to stdout, the error is empty.
    script = r"""
import sys
sys.stderr.write("before error\n")
sys.exit("error")
"""
    cmd = ["python3", "-c", script]
    output = []
    with pytest.raises(commands.Error) as e:
        for line in commands.watch(*cmd, stderr=subprocess.STDOUT):
            output.append(line)

    assert output == ["before error", "error"]
    assert e.value.error == ""


def test_watch_partial_lines():
    script = """
import time