From f924e0479c5b9400d7c63a8c2af21d362baa1b16 Mon Sep 17 00:00:00 2001
From: superstar54
Date: Mon, 2 Sep 2024 16:53:17 +0200
Subject: [PATCH] Support multiple schedulers.

1) Multiple runners (daemon workers) can be started for the scheduler; each
   runner listens to the `scheduler_queue` with prefetch_count set to 1, so
   each runner launches at most one Scheduler process.
2) The Scheduler process listens to the `workgraph_queue` to launch WorkGraphs.
3) The Scheduler receives RPC calls to launch a WorkGraph.
4) Users can submit a WorkGraph to the workgraph queue, or select the
   scheduler that should run it by pk.
---
 aiida_workgraph/cli/cmd_scheduler.py          |  15 +-
 aiida_workgraph/engine/launch.py              |  77 ------
 aiida_workgraph/engine/override.py            |  65 ++++++
 aiida_workgraph/engine/scheduler/client.py    | 108 +++++++++-
 aiida_workgraph/engine/scheduler/scheduler.py |   8 +-
 aiida_workgraph/utils/control.py              |  22 +-
 aiida_workgraph/workgraph.py                  |  30 +--
 docs/source/howto/scheduler.ipynb             | 198 +++++++++++-------
 tests/test_scheduler.py                       |   3 +-
 9 files changed, 330 insertions(+), 196 deletions(-)
 create mode 100644 aiida_workgraph/engine/override.py

diff --git a/aiida_workgraph/cli/cmd_scheduler.py b/aiida_workgraph/cli/cmd_scheduler.py
index d8c52d09..2e388d18 100644
--- a/aiida_workgraph/cli/cmd_scheduler.py
+++ b/aiida_workgraph/cli/cmd_scheduler.py
@@ -2,6 +2,7 @@
 import click
 from pathlib import Path
 from aiida.cmdline.utils import decorators, echo
+from aiida.cmdline.commands.cmd_daemon import validate_daemon_workers
 from aiida.cmdline.params import options
 from aiida_workgraph.engine.scheduler.client import get_scheduler_client
 import sys
@@ -31,7 +32,7 @@ def scheduler():
 @scheduler.command()
 def worker():
     """Start the scheduler application."""
-    from aiida_workgraph.engine.launch import start_scheduler_worker
+    from aiida_workgraph.engine.scheduler.client import start_scheduler_worker

     click.echo("Starting the scheduler worker...")

@@ -40,17 +41,20 @@ def worker():

 @scheduler.command()
 @click.option("--foreground", is_flag=True, help="Run in foreground.")
+@click.argument("number", required=False, type=int, callback=validate_daemon_workers)
 @options.TIMEOUT(default=None, required=False, type=int)
 @decorators.with_dbenv()
 @decorators.requires_broker
 @decorators.check_circus_zmq_version
-def start(foreground, timeout):
+def start(foreground, number, timeout):
     """Start the scheduler application."""
+    from aiida_workgraph.engine.scheduler.client import start_scheduler_process

     click.echo("Starting the scheduler process...")

     client = get_scheduler_client()
-    client.start_daemon(foreground=foreground)
+    client.start_daemon(number_workers=number, foreground=foreground, timeout=timeout)
+    start_scheduler_process(number)


 @scheduler.command()
@@ -86,10 +90,11 @@ def stop(ctx, no_wait, all_profiles, timeout):

 @scheduler.command(hidden=True)
 @click.option("--foreground", is_flag=True, help="Run in foreground.")
+@click.argument("number", required=False, type=int, callback=validate_daemon_workers)
 @decorators.with_dbenv()
 @decorators.requires_broker
 @decorators.check_circus_zmq_version
-def start_circus(foreground):
+def start_circus(foreground, number):
     """This will actually launch the circus daemon, either daemonized in the background
     or in the foreground.

     If run in the foreground all logs are redirected to stdout.

     .. note:: this should not be called directly from the commandline!
""" - get_scheduler_client()._start_daemon(foreground=foreground) + get_scheduler_client()._start_daemon(number_workers=number, foreground=foreground) @scheduler.command() diff --git a/aiida_workgraph/engine/launch.py b/aiida_workgraph/engine/launch.py index 96324c75..e5a43c41 100644 --- a/aiida_workgraph/engine/launch.py +++ b/aiida_workgraph/engine/launch.py @@ -170,80 +170,3 @@ def submit( time.sleep(wait_interval) return node - - -def start_scheduler_worker(foreground: bool = False) -> None: - """Start a scheduler worker for the currently configured profile. - - :param foreground: If true, the logging will be configured to write to stdout, otherwise it will be configured to - write to the scheduler log file. - """ - import asyncio - import signal - import sys - - from aiida.common.log import configure_logging - from aiida.manage import get_config_option, get_manager - from aiida_workgraph.engine.scheduler import WorkGraphScheduler - from aiida_workgraph.engine.scheduler.client import ( - get_scheduler_client, - get_scheduler, - ) - from aiida.engine.processes.launcher import ProcessLauncher - from aiida.engine import persistence - from plumpy.persistence import LoadSaveContext - from aiida.engine.daemon.worker import shutdown_worker - - daemon_client = get_scheduler_client() - configure_logging( - daemon=not foreground, daemon_log_file=daemon_client.daemon_log_file - ) - - LOGGER.debug(f"sys.executable: {sys.executable}") - LOGGER.debug(f"sys.path: {sys.path}") - - try: - manager = get_manager() - # runner = manager.create_daemon_runner() - runner = manager.create_runner(broker_submit=True) - manager.set_runner(runner) - except Exception: - LOGGER.exception("daemon worker failed to start") - raise - - if isinstance(rlimit := get_config_option("daemon.recursion_limit"), int): - LOGGER.info("Setting maximum recursion limit of daemon worker to %s", rlimit) - sys.setrecursionlimit(rlimit) - - signals = (signal.SIGTERM, signal.SIGINT) - for s in signals: - # https://github.com/python/mypy/issues/12557 - runner.loop.add_signal_handler(s, lambda s=s: asyncio.create_task(shutdown_worker(runner))) # type: ignore[misc] - - try: - running_scheduler = get_scheduler() - runner_loop = runner.loop - task_receiver = ProcessLauncher( - loop=runner_loop, - persister=manager.get_persister(), - load_context=LoadSaveContext(runner=runner), - loader=persistence.get_object_loader(), - ) - asyncio.run( - task_receiver._continue( - communicator=None, pid=running_scheduler, nowait=True - ) - ) - except ValueError: - print("Starting a new Scheduler") - process_inited = instantiate_process(runner, WorkGraphScheduler) - runner.loop.create_task(process_inited.step_until_terminated()) - - try: - LOGGER.info("Starting a daemon worker") - runner.start() - except SystemError as exception: - LOGGER.info("Received a SystemError: %s", exception) - runner.close() - - LOGGER.info("Daemon worker started") diff --git a/aiida_workgraph/engine/override.py b/aiida_workgraph/engine/override.py new file mode 100644 index 00000000..f08d2f31 --- /dev/null +++ b/aiida_workgraph/engine/override.py @@ -0,0 +1,65 @@ +from plumpy.process_comms import RemoteProcessThreadController +from typing import Any, Optional + + +def create_daemon_runner( + manager, queue_name: str = None, loop: Optional["asyncio.AbstractEventLoop"] = None +) -> "Runner": + """Create and return a new daemon runner. + This is used by workers when the daemon is running and in testing. 
+ :param loop: the (optional) asyncio event loop to use + :return: a runner configured to work in the daemon configuration + """ + from plumpy.persistence import LoadSaveContext + from aiida.engine import persistence + from aiida.engine.processes.launcher import ProcessLauncher + from plumpy.communications import convert_to_comm + + runner = manager.create_runner(broker_submit=True, loop=loop) + runner_loop = runner.loop + # Listen for incoming launch requests + task_receiver = ProcessLauncher( + loop=runner_loop, + persister=manager.get_persister(), + load_context=LoadSaveContext(runner=runner), + loader=persistence.get_object_loader(), + ) + + def callback(_comm, msg): + print("Received message: {}".format(msg)) + import asyncio + + asyncio.run(task_receiver(_comm, msg)) + print("task_receiver._continue done") + return True + + assert runner.communicator is not None, "communicator not set for runner" + if queue_name is not None: + print("queue_name: {}".format(queue_name)) + queue = runner.communicator._communicator.task_queue( + queue_name, prefetch_count=1 + ) + # queue.add_task_subscriber(callback) + # important to convert the callback + converted = convert_to_comm(task_receiver, runner.communicator._loop) + queue.add_task_subscriber(converted) + else: + runner.communicator.add_task_subscriber(task_receiver) + return runner + + +class ControllerWithQueueName(RemoteProcessThreadController): + def __init__(self, queue_name: str, **kwargs): + super().__init__(**kwargs) + self.queue_name = queue_name + + def task_send(self, message: Any, no_reply: bool = False) -> Optional[Any]: + """ + Send a task to be performed using the communicator + + :param message: the task message + :param no_reply: if True, this call will be fire-and-forget, i.e. no return value + :return: the response from the remote side (if no_reply=False) + """ + queue = self._communicator.task_queue(self.queue_name) + return queue.task_send(message, no_reply=no_reply) diff --git a/aiida_workgraph/engine/scheduler/client.py b/aiida_workgraph/engine/scheduler/client.py index 1774eb64..2c019128 100644 --- a/aiida_workgraph/engine/scheduler/client.py +++ b/aiida_workgraph/engine/scheduler/client.py @@ -4,8 +4,11 @@ from aiida.common.exceptions import ConfigurationError import os from typing import Optional +from aiida.common.log import AIIDA_LOGGER +from typing import List WORKGRAPH_BIN = shutil.which("workgraph") +LOGGER = AIIDA_LOGGER.getChild("engine.launch") class SchedulerClient(DaemonClient): @@ -102,6 +105,7 @@ def cmd_start_daemon( self.profile.name, "scheduler", "start-circus", + str(number_workers), ] if foreground: @@ -114,7 +118,7 @@ def cmd_start_daemon_worker(self) -> list[str]: """Return the command to start a daemon worker process.""" return [self._workgraph_bin, "-p", self.profile.name, "scheduler", "worker"] - def _start_daemon(self, foreground: bool = False) -> None: + def _start_daemon(self, number_workers: int = 1, foreground: bool = False) -> None: """Start the daemon. .. warning:: This will daemonize the current process and put it in the background. 
It is most likely not what @@ -149,7 +153,7 @@ def _start_daemon(self, foreground: bool = False) -> None: { "cmd": " ".join(self.cmd_start_daemon_worker), "name": self.daemon_name, - "numprocesses": 1, + "numprocesses": number_workers, "virtualenv": self.virtualenv, "copy_env": True, "stdout_stream": { @@ -210,7 +214,7 @@ def get_scheduler_client(profile_name: Optional[str] = None) -> "SchedulerClient return SchedulerClient(profile) -def get_scheduler(): +def get_scheduler() -> List[int]: from aiida.orm import QueryBuilder from aiida_workgraph.engine.scheduler import WorkGraphScheduler @@ -224,7 +228,97 @@ def get_scheduler(): } qb.append(WorkGraphScheduler, filters=filters, project=projections, tag="process") results = qb.all() - if len(results) == 0: - raise ValueError("No scheduler found. Please start the scheduler first.") - scheduler_id = results[0][0] - return scheduler_id + pks = [r[0] for r in results] + return pks + + +def start_scheduler_worker(foreground: bool = False) -> None: + """Start a scheduler worker for the currently configured profile. + + :param foreground: If true, the logging will be configured to write to stdout, otherwise it will be configured to + write to the scheduler log file. + """ + import asyncio + import signal + import sys + from aiida_workgraph.engine.scheduler.client import get_scheduler_client + from aiida_workgraph.engine.override import create_daemon_runner + + from aiida.common.log import configure_logging + from aiida.manage import get_config_option + from aiida.engine.daemon.worker import shutdown_worker + + daemon_client = get_scheduler_client() + configure_logging( + daemon=not foreground, daemon_log_file=daemon_client.daemon_log_file + ) + + LOGGER.debug(f"sys.executable: {sys.executable}") + LOGGER.debug(f"sys.path: {sys.path}") + + try: + manager = get_manager() + runner = create_daemon_runner(manager, queue_name="scheduler_queue") + except Exception: + LOGGER.exception("daemon worker failed to start") + raise + + if isinstance(rlimit := get_config_option("daemon.recursion_limit"), int): + LOGGER.info("Setting maximum recursion limit of daemon worker to %s", rlimit) + sys.setrecursionlimit(rlimit) + + signals = (signal.SIGTERM, signal.SIGINT) + for s in signals: + # https://github.com/python/mypy/issues/12557 + runner.loop.add_signal_handler(s, lambda s=s: asyncio.create_task(shutdown_worker(runner))) # type: ignore[misc] + + try: + LOGGER.info("Starting a daemon worker") + runner.start() + except SystemError as exception: + LOGGER.info("Received a SystemError: %s", exception) + runner.close() + + LOGGER.info("Daemon worker started") + + +def start_scheduler_process(number: int = 1) -> None: + """Start or restart the specified number of scheduler processes.""" + from aiida_workgraph.engine.scheduler import WorkGraphScheduler + from aiida_workgraph.engine.scheduler.client import get_scheduler + from aiida_workgraph.utils.control import create_scheduler_action + from aiida_workgraph.engine.utils import instantiate_process + + try: + schedulers: List[int] = get_scheduler() + existing_schedulers_count = len(schedulers) + print( + "Found {} existing scheduler(s): {}".format( + existing_schedulers_count, " ".join([str(pk) for pk in schedulers]) + ) + ) + + count = 0 + + # Restart existing schedulers if they exceed the number to start + if existing_schedulers_count > number: + for pk in schedulers[:number]: + create_scheduler_action(pk) + print(f"Scheduler with pk {pk} restarted.") + count += 1 + else: + count = existing_schedulers_count + + # Start 
new schedulers if more are needed
+        runner = get_manager().get_runner()
+        for i in range(count, number):
+            process_inited = instantiate_process(runner, WorkGraphScheduler)
+            process_inited.runner.persister.save_checkpoint(process_inited)
+            process_inited.close()
+            create_scheduler_action(process_inited.node.pk)
+            print(f"Scheduler with pk {process_inited.node.pk} started.")
+
+        print(f"Total schedulers running: {number}")
+
+    except Exception as e:
+        raise RuntimeError(f"An error occurred while starting schedulers: {e}") from e
diff --git a/aiida_workgraph/engine/scheduler/scheduler.py b/aiida_workgraph/engine/scheduler/scheduler.py
index 5d91c489..71f4d3c3 100644
--- a/aiida_workgraph/engine/scheduler/scheduler.py
+++ b/aiida_workgraph/engine/scheduler/scheduler.py
@@ -1637,18 +1637,18 @@ def message_receive(
     def call_on_receive_workgraph_message(self, _comm, msg):
         """Call on receive workgraph message."""
         # self.report(f"Received workgraph message: {msg}")
-        pk = int(msg)
+        pk = msg["args"]["pid"]
         # To avoid "DbNode is not persistent", we need to schedule the call
         self._schedule_rpc(self.launch_workgraph, pk=pk)
         return True

     def add_workgraph_subsriber(self) -> None:
         """Add workgraph subscriber."""
-        queue_name = "scheduler_queue"
-        # self.report(f"Add workgraph subscriber on queue: {queue_name}")
+        queue_name = "workgraph_queue"
+        self.report(f"Add workgraph subscriber on queue: {queue_name}")
         comm = self.runner.communicator._communicator
         queue = comm.task_queue(queue_name, prefetch_count=1000)
-        queue.add_task_subscriber(self.callback)
+        queue.add_task_subscriber(self.call_on_receive_workgraph_message)

     def finalize_workgraph(self, pk: int) -> t.Optional[ExitCode]:
         """"""
diff --git a/aiida_workgraph/utils/control.py b/aiida_workgraph/utils/control.py
index 8b54f439..4fa50be6 100644
--- a/aiida_workgraph/utils/control.py
+++ b/aiida_workgraph/utils/control.py
@@ -1,6 +1,7 @@
 from aiida.manage import get_manager
 from aiida import orm
 from aiida.engine.processes import control
+from aiida_workgraph.engine.override import ControllerWithQueueName


 def create_task_action(
@@ -22,10 +23,23 @@ def create_scheduler_action(
 ):
     """Send workgraph task to scheduler."""

-    controller = get_manager().get_process_controller()
-    message = str(pk)
-    queue = controller._communicator.task_queue("scheduler_queue")
-    queue.task_send(message)
+    manager = get_manager()
+    controller = ControllerWithQueueName(
+        queue_name="scheduler_queue", communicator=manager.get_communicator()
+    )
+    controller.continue_process(pk, nowait=False)
+
+
+def create_workgraph_action(
+    pk: int,
+):
+    """Send the WorkGraph to the workgraph queue."""
+
+    manager = get_manager()
+    controller = ControllerWithQueueName(
+        queue_name="workgraph_queue", communicator=manager.get_communicator()
+    )
+    controller.continue_process(pk, nowait=False)


 def get_task_state_info(node, name: str, key: str) -> str:
diff --git a/aiida_workgraph/workgraph.py b/aiida_workgraph/workgraph.py
index 647f8778..e99a34f6 100644
--- a/aiida_workgraph/workgraph.py
+++ b/aiida_workgraph/workgraph.py
@@ -124,15 +124,6 @@ def submit(
             restart (bool): Restart the process, and reset the modified tasks, then only re-run the modified tasks.
             new (bool): Submit a new process.
""" - from aiida_workgraph.engine.scheduler.client import get_scheduler - - if to_scheduler: - try: - get_scheduler() - except ValueError as e: - print(e) - return - # set task inputs if inputs is not None: for name, input in inputs.items(): @@ -439,27 +430,14 @@ def continue_process_in_scheduler(self, to_scheduler: Union[int, bool]) -> None: """ from aiida_workgraph.utils.control import ( create_task_action, - create_scheduler_action, + create_workgraph_action, ) - from aiida_workgraph.engine.scheduler.client import get_scheduler - import kiwipy try: - if isinstance(to_scheduler, int): - scheduler_pk = get_scheduler() - create_task_action(scheduler_pk, [self.pk], action="launch_workgraph") + if isinstance(to_scheduler, int) and not isinstance(to_scheduler, bool): + create_task_action(to_scheduler, [self.pk], action="launch_workgraph") else: - create_scheduler_action(self.pk) - except ValueError: - print( - """Scheduler is not running. -Please start the scheduler first with `aiida-workgraph scheduler start`""" - ) - except kiwipy.exceptions.UnroutableError: - print( - """Scheduler exists, but the daemon is not running. -Please start the scheduler first with `aiida-workgraph scheduler start`""" - ) + create_workgraph_action(self.pk) except Exception as e: print("""An unexpected error occurred:""", e) diff --git a/docs/source/howto/scheduler.ipynb b/docs/source/howto/scheduler.ipynb index ad046289..97ece6a1 100644 --- a/docs/source/howto/scheduler.ipynb +++ b/docs/source/howto/scheduler.ipynb @@ -6,42 +6,91 @@ "source": [ "# Scheduler\n", "\n", - "Start a scheduler daemon:\n", + "## Overview\n", + "\n", + "This documentation provides a guide on using the `aiida-workgraph` Scheduler to manage `WorkGraph` processes efficiently.\n", + "\n", + "### Background\n", + "\n", + "Traditional workflow processes, particularly in nested structures like `PwBandsWorkChain`, tend to create multiple Workflow processes in a waiting state, while only a few `CalcJob` processes run actively. This results in inefficient resource usage. The `WorkChain` structure makes it challenging to eliminate these waiting processes due to its encapsulated logic.\n", + "\n", + "In contrast, the `WorkGraph` offers a more clear task dependency and allow other process to run its tasks in a controllable way. 
With a scheduler, one only needs to create the `WorkGraph` process in the database, rather than running it via a daemon worker.\n",
+    "\n",
+    "### Process Comparison: `PwBands` Case\n",
+    "\n",
+    "- **Old Approach**: 300 Workflow processes (Bands, Relax, Base) + 100 CalcJob processes.\n",
+    "- **New Approach**: 1 Scheduler process + 100 CalcJob processes.\n",
+    "\n",
+    "This new approach significantly reduces the number of active processes and mitigates the risk of deadlocks.\n",
+    "\n",
+    "## Getting Started with the Scheduler\n",
+    "\n",
+    "### Starting the Scheduler\n",
+    "\n",
+    "To launch a scheduler daemon:\n",
     "\n",
     "```console\n",
     "workgraph scheduler start\n",
     "```\n",
     "\n",
-    "Check the status of the scheduler:\n",
+    "### Monitoring the Scheduler\n",
+    "\n",
+    "To check the current status of the scheduler:\n",
     "\n",
     "```console\n",
     "workgraph scheduler status\n",
     "```\n",
     "\n",
-    "Stop the scheduler:\n",
+    "### Stopping the Scheduler\n",
+    "\n",
+    "To stop the scheduler daemon:\n",
     "\n",
     "```console\n",
     "workgraph scheduler stop\n",
     "```\n",
     "\n",
-    "## Submit workgraph to the scheduler\n",
-    "Set `to_scheduler` to `True` when submitting a workgraph to the scheduler:\n",
+    "## Submitting WorkGraphs to the Scheduler\n",
+    "\n",
+    "To submit a WorkGraph to the scheduler, set the `to_scheduler` flag to `True`:\n",
     "\n",
     "```python\n",
     "wg.submit(to_scheduler=True)\n",
-    "```"
+    "```\n",
+    "\n",
+    "\n",
+    "### Using Multiple Schedulers\n",
+    "\n",
+    "For environments with a high volume of WorkGraphs, starting multiple schedulers can enhance throughput:\n",
+    "\n",
+    "```console\n",
+    "workgraph scheduler start 2\n",
+    "```\n",
+    "\n",
+    "WorkGraphs will be automatically distributed among available schedulers.\n",
+    "\n",
+    "#### Specifying a Scheduler\n",
+    "\n",
+    "To submit a WorkGraph to a specific scheduler using its primary key (`pk`):\n",
+    "\n",
+    "```python\n",
+    "wg.submit(to_scheduler=pk_scheduler)\n",
+    "```\n",
+    "\n",
+    "### Best Practices for Scheduler Usage\n",
+    "\n",
+    "While a single scheduler suffices for most use cases, scaling up the number of schedulers may be beneficial when significantly increasing the number of workers.
A general rule is to maintain a ratio of less than 5 workers per scheduler.\n" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "WorkGraph process created, PK: 134971\n", + "WorkGraph process created, PK: 142617\n", "State of WorkGraph : FINISHED\n", "Result of add2 : 4\n" ] @@ -78,7 +127,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -94,41 +143,41 @@ " viewBox=\"0.00 0.00 1036.43 720.06\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n", "\n", "\n", - "\n", + "\n", "\n", - "N134971\n", + "N142617\n", "\n", - "WorkGraph<test_scheduler> (134971)\n", + "WorkGraph<test_scheduler> (142617)\n", "State: finished\n", "Exit Code: 0\n", "\n", - "\n", + "\n", "\n", - "N134974\n", + "N142620\n", "\n", - "ArithmeticAddCalculation (134974)\n", + "ArithmeticAddCalculation (142620)\n", "State: finished\n", "Exit Code: 0\n", "\n", - "\n", - "\n", - "N134971->N134974\n", + "\n", + "\n", + "N142617->N142620\n", "\n", "\n", "CALL_CALC\n", "add1\n", "\n", - "\n", + "\n", "\n", - "N134979\n", + "N142625\n", "\n", - "ArithmeticAddCalculation (134979)\n", + "ArithmeticAddCalculation (142625)\n", "State: finished\n", "Exit Code: 0\n", "\n", - "\n", - "\n", - "N134971->N134979\n", + "\n", + "\n", + "N142617->N142625\n", "\n", "\n", "CALL_CALC\n", @@ -141,111 +190,111 @@ "InstalledCode (37)\n", "add@localhost\n", "\n", - "\n", + "\n", "\n", - "N37->N134971\n", + "N37->N142617\n", "\n", "\n", "INPUT_WORK\n", - "wg__tasks__add2__properties__code__value\n", + "wg__tasks__add1__properties__code__value\n", "\n", - "\n", + "\n", "\n", - "N37->N134971\n", + "N37->N142617\n", "\n", "\n", "INPUT_WORK\n", - "wg__tasks__add1__properties__code__value\n", + "wg__tasks__add2__properties__code__value\n", "\n", - "\n", + "\n", "\n", - "N134975\n", + "N142621\n", "\n", - "RemoteData (134975)\n", + "RemoteData (142621)\n", "@localhost\n", "\n", - "\n", - "\n", - "N134974->N134975\n", + "\n", + "\n", + "N142620->N142621\n", "\n", "\n", "CREATE\n", "remote_folder\n", "\n", - "\n", + "\n", "\n", - "N134976\n", + "N142622\n", "\n", - "FolderData (134976)\n", + "FolderData (142622)\n", "\n", - "\n", - "\n", - "N134974->N134976\n", + "\n", + "\n", + "N142620->N142622\n", "\n", "\n", "CREATE\n", "retrieved\n", "\n", - "\n", + "\n", "\n", - "N134977\n", + "N142623\n", "\n", - "Int (134977)\n", + "Int (142623)\n", "\n", - "\n", - "\n", - "N134974->N134977\n", + "\n", + "\n", + "N142620->N142623\n", "\n", "\n", "CREATE\n", "sum\n", "\n", - "\n", - "\n", - "N134977->N134979\n", + "\n", + "\n", + "N142623->N142625\n", "\n", "\n", "INPUT_CALC\n", "y\n", "\n", - "\n", + "\n", "\n", - "N134980\n", + "N142626\n", "\n", - "RemoteData (134980)\n", + "RemoteData (142626)\n", "@localhost\n", "\n", - "\n", - "\n", - "N134979->N134980\n", + "\n", + "\n", + "N142625->N142626\n", "\n", "\n", "CREATE\n", "remote_folder\n", "\n", - "\n", + "\n", "\n", - "N134981\n", + "N142627\n", "\n", - "FolderData (134981)\n", + "FolderData (142627)\n", "\n", - "\n", - "\n", - "N134979->N134981\n", + "\n", + "\n", + "N142625->N142627\n", "\n", "\n", "CREATE\n", "retrieved\n", "\n", - "\n", + "\n", "\n", - "N134982\n", + "N142628\n", "\n", - "Int (134982)\n", + "Int (142628)\n", "\n", - "\n", + "\n", "\n", - "N134979->N134982\n", + "N142625->N142628\n", "\n", "\n", "CREATE\n", @@ -255,10 +304,10 @@ "\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 6, 
+ "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -272,7 +321,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Conclusion" + "\n", + "## Checkpointing\n", + "\n", + "The Scheduler checkpoints its status to the database whenever a WorkGraph is updated, ensuring that the Scheduler can recover its state in case of a crash or restart. This feature is particularly useful for long-running WorkGraphs.\n", + "\n", + "## Conclusion\n", + "\n", + "The Scheduler offers a streamlined approach to managing complex workflows, significantly reducing active process counts and improving resource efficiency." ] } ], diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py index f51a1320..c6308551 100644 --- a/tests/test_scheduler.py +++ b/tests/test_scheduler.py @@ -6,7 +6,7 @@ from aiida import orm -@pytest.skip("Skip for now") +@pytest.mark.skip("Skip for now") @pytest.mark.usefixtures("started_daemon_client") def test_scheduler(decorated_add: Callable, started_scheduler_client) -> None: """Test graph build.""" @@ -14,7 +14,6 @@ def test_scheduler(decorated_add: Callable, started_scheduler_client) -> None: add1 = wg.add_task(decorated_add, x=2, y=3) add2 = wg.add_task(decorated_add, "add2", x=3, y=add1.outputs["result"]) # use run to check if graph builder workgraph can be submit inside the engine - pk = get_scheduler() wg.submit(to_scheduler=True, wait=True) pk = get_scheduler() report = get_workchain_report(orm.load(pk), "REPORT")
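
Usage sketch (illustrative note, not part of the patch): with the scheduler
started via `workgraph scheduler start`, a WorkGraph is routed to it by passing
`to_scheduler` to `submit`, as exercised in docs/source/howto/scheduler.ipynb
and tests/test_scheduler.py. The `add` calcfunction, the WorkGraph name, and
the scheduler pk below are assumptions for illustration only.

from aiida import load_profile
from aiida_workgraph import WorkGraph, task

load_profile()


@task.calcfunction()
def add(x, y):
    # Simple calcfunction standing in for a real task.
    return x + y


wg = WorkGraph("scheduler_demo")
add1 = wg.add_task(add, "add1", x=2, y=3)
wg.add_task(add, "add2", x=3, y=add1.outputs["result"])

# Send the WorkGraph to the workgraph queue; any running scheduler picks it up.
wg.submit(to_scheduler=True, wait=True)

# Alternatively, target one scheduler process by its pk (value is illustrative).
# wg.submit(to_scheduler=12345)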