From fa0aa3a2bb185893de61736cb4c7796307c5b7f6 Mon Sep 17 00:00:00 2001
From: Abel Soares Siqueira <abel.s.siqueira@gmail.com>
Date: Thu, 3 Feb 2022 14:03:34 +0100
Subject: [PATCH 001/183] Add cffconvert.yml to validate CITATION.cff

---
 .github/workflows/cffconvert.yml | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)
 create mode 100644 .github/workflows/cffconvert.yml

diff --git a/.github/workflows/cffconvert.yml b/.github/workflows/cffconvert.yml
new file mode 100644
index 00000000..707a71c4
--- /dev/null
+++ b/.github/workflows/cffconvert.yml
@@ -0,0 +1,19 @@
+name: cffconvert
+
+on:
+  push:
+    paths:
+      - CITATION.cff
+
+jobs:
+  validate:
+    name: "validate"
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out a copy of the repository
+        uses: actions/checkout@v2
+
+      - name: Check whether the citation metadata from CITATION.cff is valid
+        uses: citation-file-format/cffconvert-github-action@2.0.0
+        with:
+          args: "--validate"

From a327e141b8e035599ed23dc25d9d8d52232fbb54 Mon Sep 17 00:00:00 2001
From: Abel Soares Siqueira <abel.s.siqueira@gmail.com>
Date: Thu, 3 Feb 2022 14:03:34 +0100
Subject: [PATCH 002/183] Update CITATION.cff cffversion to 1.2.0

---
 CITATION.cff | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CITATION.cff b/CITATION.cff
index 9b2c5304..71dbcb44 100644
--- a/CITATION.cff
+++ b/CITATION.cff
@@ -1,6 +1,6 @@
 # YAML 1.2
 ---
-cff-version: "1.1.0"
+cff-version: 1.2.0
 
 title: "MUSCLE 3: The Multiscale Coupling Library and Environment"
 doi: "10.5281/zenodo.3258864"

From 38761c9676ed14c038072a33079c0c6cdfc0e09b Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Tue, 9 Aug 2022 13:34:14 +0200
Subject: [PATCH 003/183] Set --muscle_manager in test_mpi_macro_micro

Fixes the test case integration_test/test_mpi_macro_micro.py from
failing when the default port (9000) is occupied by another process when
running `make test`.
---
 integration_test/test_mpi_macro_micro.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/integration_test/test_mpi_macro_micro.py b/integration_test/test_mpi_macro_micro.py
index af0c4890..2b02fa76 100644
--- a/integration_test/test_mpi_macro_micro.py
+++ b/integration_test/test_mpi_macro_micro.py
@@ -11,8 +11,9 @@
 from .conftest import skip_if_python_only
 
 
-def run_macro(instance_id: str):
+def run_macro(instance_id: str, muscle_manager: str):
     sys.argv.append('--muscle-instance={}'.format(instance_id))
+    sys.argv.append('--muscle-manager={}'.format(muscle_manager))
     macro()
 
 
@@ -61,7 +62,8 @@ def test_mpi_macro_micro(tmpdir, mmp_server_process_simple):
              str(mpi_test_micro), '--muscle-instance=micro'], env=env)
 
     # run macro model
-    macro_process = mp.Process(target=run_macro, args=('macro',))
+    macro_process = mp.Process(target=run_macro,
+                               args=('macro', mmp_server_process_simple))
     macro_process.start()
 
     # check results

From 6eba65c0655fb7dea1b52efddf2da6d6b167888e Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Tue, 9 Aug 2022 13:38:05 +0200
Subject: [PATCH 004/183] Fix flake8 error

Fixes the following flake8 v5.0.4 error:
- libmuscle/python/libmuscle/instance.py:443:20: E275 missing whitespace
  after keyword
---
 libmuscle/python/libmuscle/instance.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py
index e0e6d542..24c10d5a 100644
--- a/libmuscle/python/libmuscle/instance.py
+++ b/libmuscle/python/libmuscle/instance.py
@@ -440,7 +440,7 @@ def __receive_message(
         if port.operator == Operator.F_INIT:
             if (port_name, slot) in self._f_init_cache:
                 msg = self._f_init_cache[(port_name, slot)]
-                del(self._f_init_cache[(port_name, slot)])
+                del self._f_init_cache[(port_name, slot)]
                 if with_settings and msg.settings is None:
                     err_msg = ('If you use receive_with_settings()'
                                ' on an F_INIT port, then you have to'

From 36d37e50f54c26b5b0c9b3cafada23e20ad72616 Mon Sep 17 00:00:00 2001
From: Lourens Veen <l.veen@esciencecenter.nl>
Date: Thu, 15 Sep 2022 15:57:55 +0200
Subject: [PATCH 005/183] Add ITER Organization as copyright holder. Welcome
 and thank you!

---
 NOTICE              | 1 +
 README.rst          | 3 ++-
 docs/source/conf.py | 2 +-
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/NOTICE b/NOTICE
index 9110265d..2a353538 100644
--- a/NOTICE
+++ b/NOTICE
@@ -1,2 +1,3 @@
 MUSCLE3
 Copyright 2018-2022, Netherlands eScience Center and University of Amsterdam
+Copyright 2022, The ITER Organization
diff --git a/README.rst b/README.rst
index ec9a7352..27405971 100644
--- a/README.rst
+++ b/README.rst
@@ -40,7 +40,8 @@ Legal
 =====
 
 MUSCLE3 is Copyright 2018-2022 University of Amsterdam and Netherlands eScience
-Center. It is licensed under the Apache License 2.0.
+Center, and Copyright 2022 ITER Organisation. It is licensed under the Apache
+License 2.0.
 
 
 Contributing
diff --git a/docs/source/conf.py b/docs/source/conf.py
index c9a0c10c..7464d1c0 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -67,7 +67,7 @@
 
 # General information about the project.
 project = 'muscle3'
-copyright = '2018-2022 University of Amsterdam and Netherlands eScience Center'
+copyright = '2018-2022 University of Amsterdam and Netherlands eScience Center, 2022 The ITER Organization'
 author = 'Lourens Veen'
 
 # The version info for the project you're documenting, acts as replacement for

From 50241cd24e715b83c73027d7c80e8ab53ebe4ee5 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Thu, 15 Sep 2022 12:55:27 +0200
Subject: [PATCH 006/183] Reduce delays

- instance_manager.py:
  - Increase polling frequency of LogHandlingThread. Reduces the
    shutdown delay (in InstanceManager.shutdown), saves 0-900 ms.
- qcgpj_instantiator.py
  - Add a 10ms delay at the start of QCGPJInstantiator._main to allow
    the main process some time for submitting InstantiationRequests.
    Saves 90ms in startup duration.
  - Do not sleep in QCGPJInstantiator._main when a shutdown request is
    received. If all instances exited successfully we can immediately be
    done and save a 100ms wait.
- tcp_transport_server.py
  - Set poll_interval of SocketServer.serve_forever to 100ms. Saves
    0-400 ms in shutdown duration (in instances and muscle_manager).

Combined, these changes save 190ms to ~2 seconds for a run started with
`muscle_manager	--start-all`. This is most notable for short runs, like
the ones in the unit tests.

`make test` duration, averaged over 5 runs (no compilation):
- Before this commit: 50.51 seconds
- After this commit: 29.51 seconds
---
 libmuscle/python/libmuscle/manager/instance_manager.py   | 2 +-
 libmuscle/python/libmuscle/manager/qcgpj_instantiator.py | 7 ++++++-
 libmuscle/python/libmuscle/mcp/tcp_transport_server.py   | 2 +-
 3 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/libmuscle/python/libmuscle/manager/instance_manager.py b/libmuscle/python/libmuscle/manager/instance_manager.py
index 139c6beb..9d2c8b30 100644
--- a/libmuscle/python/libmuscle/manager/instance_manager.py
+++ b/libmuscle/python/libmuscle/manager/instance_manager.py
@@ -41,7 +41,7 @@ def run(self) -> None:
         """The thread's entry point."""
         while True:
             try:
-                record = self._queue.get(True, 1.0)
+                record = self._queue.get(True, 0.1)
                 logger = logging.getLogger(record.name)
                 logger.handle(record)
             except queue.Empty:
diff --git a/libmuscle/python/libmuscle/manager/qcgpj_instantiator.py b/libmuscle/python/libmuscle/manager/qcgpj_instantiator.py
index e3575b11..89f8bc3e 100644
--- a/libmuscle/python/libmuscle/manager/qcgpj_instantiator.py
+++ b/libmuscle/python/libmuscle/manager/qcgpj_instantiator.py
@@ -148,9 +148,12 @@ async def _main(self) -> None:
         """
         qcg_iters = dict()  # type: Dict[Reference, qcg_SchedulingIteration]
 
+        await asyncio.sleep(0.01)  # allow requests_in queue to be populated
+
         shutting_down = False
         done = False
         while not done:
+            do_sleep = True
             while not shutting_down:
                 try:
                     request = self._requests_in.get_nowait()
@@ -158,6 +161,7 @@ async def _main(self) -> None:
                         _logger.debug('Got ShutdownRequest')
                         self._state_tracker.stop_processing = True
                         shutting_down = True
+                        do_sleep = False
 
                     elif isinstance(request, CancelAllRequest):
                         _logger.debug('Got CancelAllRequest')
@@ -178,7 +182,8 @@ async def _main(self) -> None:
                 except queue.Empty:
                     break
 
-            await asyncio.sleep(0.1)
+            if do_sleep:
+                await asyncio.sleep(0.1)
 
             for name, process in list(self._state_tracker.processes.items()):
                 if process.status.is_finished():
diff --git a/libmuscle/python/libmuscle/mcp/tcp_transport_server.py b/libmuscle/python/libmuscle/mcp/tcp_transport_server.py
index 79513420..2219cd76 100644
--- a/libmuscle/python/libmuscle/mcp/tcp_transport_server.py
+++ b/libmuscle/python/libmuscle/mcp/tcp_transport_server.py
@@ -71,7 +71,7 @@ def __init__(self, handler: RequestHandler, port: int = 0) -> None:
 
         self._server = TcpTransportServerImpl(('', port), TcpHandler, self)
         self._server_thread = threading.Thread(
-                target=self._server.serve_forever, daemon=True)
+                target=self._server.serve_forever, args=(0.1,), daemon=True)
         self._server_thread.start()
 
     def get_location(self) -> str:

From 900d621ab84c4e70d078cab1c6c95ac3c598755f Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Fri, 16 Sep 2022 10:20:59 +0200
Subject: [PATCH 007/183] Update qcgpj sleep

---
 libmuscle/python/libmuscle/manager/qcgpj_instantiator.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/libmuscle/python/libmuscle/manager/qcgpj_instantiator.py b/libmuscle/python/libmuscle/manager/qcgpj_instantiator.py
index 89f8bc3e..a150904d 100644
--- a/libmuscle/python/libmuscle/manager/qcgpj_instantiator.py
+++ b/libmuscle/python/libmuscle/manager/qcgpj_instantiator.py
@@ -153,7 +153,6 @@ async def _main(self) -> None:
         shutting_down = False
         done = False
         while not done:
-            do_sleep = True
             while not shutting_down:
                 try:
                     request = self._requests_in.get_nowait()
@@ -161,7 +160,6 @@ async def _main(self) -> None:
                         _logger.debug('Got ShutdownRequest')
                         self._state_tracker.stop_processing = True
                         shutting_down = True
-                        do_sleep = False
 
                     elif isinstance(request, CancelAllRequest):
                         _logger.debug('Got CancelAllRequest')
@@ -182,9 +180,6 @@ async def _main(self) -> None:
                 except queue.Empty:
                     break
 
-            if do_sleep:
-                await asyncio.sleep(0.1)
-
             for name, process in list(self._state_tracker.processes.items()):
                 if process.status.is_finished():
                     _logger.debug(f'Reporting {name} done')
@@ -195,6 +190,9 @@ async def _main(self) -> None:
                 _logger.debug(f'Done: {self._state_tracker.processes}')
                 done = len(self._state_tracker.processes) == 0
 
+            if not done:
+                await asyncio.sleep(0.1)
+
         _logger.debug('Stopping executor')
         await self._executor.stop()
 

From a38e719fb8e2de07c18ffd3945537088df3e9cf9 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Fri, 16 Sep 2022 13:39:34 +0200
Subject: [PATCH 008/183] Create overloads for Instance.get_setting

Allows typecheckers (e.g. mypy) to deduce the correct type when setting
the typ argument.
---
 libmuscle/python/libmuscle/instance.py | 37 ++++++++++++++++++++++----
 1 file changed, 32 insertions(+), 5 deletions(-)

diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py
index 24c10d5a..af84052e 100644
--- a/libmuscle/python/libmuscle/instance.py
+++ b/libmuscle/python/libmuscle/instance.py
@@ -2,7 +2,7 @@
 import logging
 import os
 import sys
-from typing import cast, Dict, List, Optional, Tuple
+from typing import Literal, cast, Dict, List, Optional, Tuple, overload
 
 from ymmsl import (Identifier, Operator, SettingValue, Port, Reference,
                    Settings)
@@ -157,6 +157,35 @@ def error_shutdown(self, message: str) -> None:
         """
         self.__shutdown(message)
 
+    @overload
+    def get_setting(self, name: str, typ: Literal['str']) -> str:
+        ...
+
+    @overload
+    def get_setting(self, name: str, typ: Literal['int']) -> int:
+        ...
+
+    @overload
+    def get_setting(self, name: str, typ: Literal['float']) -> float:
+        ...
+
+    @overload
+    def get_setting(self, name: str, typ: Literal['bool']) -> bool:
+        ...
+
+    @overload
+    def get_setting(self, name: str, typ: Literal['[float]']) -> List[float]:
+        ...
+
+    @overload
+    def get_setting(
+            self, name: str, typ: Literal['[[float]]']) -> List[List[float]]:
+        ...
+
+    @overload
+    def get_setting(self, name: str, typ: None = None) -> SettingValue:
+        ...
+
     def get_setting(self, name: str, typ: Optional[str] = None
                     ) -> SettingValue:
         """Returns the value of a model setting.
@@ -620,8 +649,7 @@ def _set_remote_log_level(self) -> None:
 
         """
         try:
-            log_level_str = cast(
-                    str, self.get_setting('muscle_remote_log_level', 'str'))
+            log_level_str = self.get_setting('muscle_remote_log_level', 'str')
         except KeyError:
             # muscle_remote_log_level not set, do nothing and keep the default
             return
@@ -656,8 +684,7 @@ def _set_local_log_level(self) -> None:
 
         """
         try:
-            log_level_str = cast(
-                    str, self.get_setting('muscle_local_log_level', 'str'))
+            log_level_str = self.get_setting('muscle_local_log_level', 'str')
 
             log_level = LogLevel[log_level_str.upper()]
             if log_level is None:

From 9ace491608632862fd42963582631b3022df36d2 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Fri, 16 Sep 2022 14:00:58 +0200
Subject: [PATCH 009/183] Import Literal from typing_extensions for py<3.8

---
 libmuscle/python/libmuscle/instance.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py
index af84052e..513018d6 100644
--- a/libmuscle/python/libmuscle/instance.py
+++ b/libmuscle/python/libmuscle/instance.py
@@ -2,7 +2,9 @@
 import logging
 import os
 import sys
-from typing import Literal, cast, Dict, List, Optional, Tuple, overload
+from typing import cast, Dict, List, Optional, Tuple, overload
+# TODO: import from typing module when dropping support for python 3.7
+from typing_extensions import Literal
 
 from ymmsl import (Identifier, Operator, SettingValue, Port, Reference,
                    Settings)

From c32e9fa4ebf01dfcc49d57c0c5d2e0bc1d169c0d Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Fri, 26 Aug 2022 11:23:45 +0200
Subject: [PATCH 010/183] Implement checkpoint triggers and tests

---
 .../python/libmuscle/snapshot_manager.py      | 151 ++++++++++++++++++
 .../libmuscle/test/test_snapshot_manager.py   | 133 +++++++++++++++
 2 files changed, 284 insertions(+)
 create mode 100644 libmuscle/python/libmuscle/snapshot_manager.py
 create mode 100644 libmuscle/python/libmuscle/test/test_snapshot_manager.py

diff --git a/libmuscle/python/libmuscle/snapshot_manager.py b/libmuscle/python/libmuscle/snapshot_manager.py
new file mode 100644
index 00000000..36b742d8
--- /dev/null
+++ b/libmuscle/python/libmuscle/snapshot_manager.py
@@ -0,0 +1,151 @@
+import bisect
+from typing import List, Optional, Union
+
+from ymmsl import CheckpointRange, CheckpointRules
+
+
+class CheckpointTrigger:
+    """Represents a trigger for creating snapshots"""
+
+    def next_checkpoint(self, cur_time: float) -> Optional[float]:
+        """Calculate the next checkpoint time
+
+        Args:
+            cur_time: current time.
+
+        Returns:
+            The time when a next checkpoint should be taken, or None if this
+            trigger has no checkpoint after cur_time.
+        """
+        raise NotImplementedError()
+
+    def previous_checkpoint(self, cur_time: float) -> Optional[float]:
+        """Calculate the previous checkpoint time
+
+        Args:
+            cur_time: current time.
+
+        Returns:
+            The time when a previous checkpoint should have been taken, or None
+            if this trigger has no checkpoint after cur_time.
+        """
+        raise NotImplementedError()
+
+
+class AtCheckpointTrigger(CheckpointTrigger):
+    """Represents a trigger based on an "at" checkpoint rule
+
+    This triggers at the specified times.
+    """
+
+    def __init__(self, at: List[Union[float, int]]) -> None:
+        """Create an "at" checkpoint trigger
+
+        Args:
+            at: list of checkpoint moments
+        """
+        self._at = at
+        self._at.sort()  # ymmsl already sorts, but just to be sure
+
+    def next_checkpoint(self, cur_time: float) -> Optional[float]:
+        if cur_time >= self._at[-1]:
+            return None  # no future checkpoint left
+        idx = bisect.bisect(self._at, cur_time)
+        return self._at[idx]
+
+    def previous_checkpoint(self, cur_time: float) -> Optional[float]:
+        if cur_time < self._at[0]:
+            return None  # no previous checkpoint
+        idx = bisect.bisect(self._at, cur_time)
+        return self._at[idx - 1]
+
+
+class RangeCheckpointTrigger(CheckpointTrigger):
+    """Represents a trigger based on a "ranges" checkpoint rule
+
+    This triggers at a range of checkpoint moments.
+
+    Equivalent an "at" rule ``[start, start + step, start + 2*step, ...]`` for
+    as long as ``start + i*step <= stop``.
+
+    Stop may be omitted, in which case the range is infinite.
+
+    Start may be omitted, in which case the range is equivalent to an "at" rule
+    ``[..., -n*step, ..., -step, 0, step, 2*step, ...]`` for as long as
+    ``i*step <= stop``.
+
+    Note: the "every" rule is a special case of a range with start and stop
+    omitted, and is handled by this class as well
+    """
+
+    def __init__(self, range: CheckpointRange) -> None:
+        """Create a range of checkpoints
+
+        Args:
+            range: checkpoint range defining start, stop and step.
+        """
+        self._start = range.start
+        self._stop = range.stop
+        self._step = range.step
+        self._last = None  # type: Union[int, float, None]
+        if self._stop is not None:
+            start = 0 if self._start is None else self._start
+            diff = self._stop - start
+            self._last = start + (diff // self._step) * self._step
+
+    def next_checkpoint(self, cur_time: float) -> Optional[float]:
+        if self._start is not None and cur_time < self._start:
+            return float(self._start)
+        if self._last is not None and cur_time >= self._last:
+            return None
+        start = 0 if self._start is None else self._start
+        diff = cur_time - start
+        return float(start + (diff // self._step + 1) * self._step)
+
+    def previous_checkpoint(self, cur_time: float) -> Optional[float]:
+        if self._start is not None and cur_time < self._start:
+            return None
+        if self._last is not None and cur_time > self._last:
+            return float(self._last)
+        start = 0 if self._start is None else self._start
+        diff = cur_time - start
+        return float(start + (diff // self._step) * self._step)
+
+
+class CombinedCheckpointTriggers(CheckpointTrigger):
+    """Checkpoint trigger based on a combination of "every", "at" and "ranges"
+    """
+
+    def __init__(self, checkpoint_rules: CheckpointRules) -> None:
+        """Create a new combined checkpoint trigger from the given rules
+
+        Args:
+            checkpoint_rules: checkpoint rules (from ymmsl) defining "every",
+                "at", and/or "ranges" rules
+        """
+        self._triggers = []  # type: List[CheckpointTrigger]
+        if checkpoint_rules.every is not None:
+            cp_range = CheckpointRange(step=checkpoint_rules.every)
+            self._triggers.append(RangeCheckpointTrigger(cp_range))
+        if checkpoint_rules.at:
+            self._triggers.append(AtCheckpointTrigger(checkpoint_rules.at))
+        for cp_range in checkpoint_rules.ranges:
+            self._triggers.append(RangeCheckpointTrigger(cp_range))
+
+    def next_checkpoint(self, cur_time: float) -> Optional[float]:
+        checkpoints = (trigger.next_checkpoint(cur_time)
+                       for trigger in self._triggers)
+        # return earliest of all not-None next-checkpoints
+        return min((checkpoint
+                    for checkpoint in checkpoints
+                    if checkpoint is not None),
+                   default=None)  # return None if all triggers return None
+
+    def previous_checkpoint(self, cur_time: float) -> Optional[float]:
+        checkpoints = (trigger.previous_checkpoint(cur_time)
+                       for trigger in self._triggers)
+        # return latest of all not-None previous-checkpoints
+        return max((checkpoint
+                    for checkpoint in checkpoints
+                    if checkpoint is not None),
+                   default=None)  # return None if all triggers return None
diff --git a/libmuscle/python/libmuscle/test/test_snapshot_manager.py b/libmuscle/python/libmuscle/test/test_snapshot_manager.py
new file mode 100644
index 00000000..b557e5b8
--- /dev/null
+++ b/libmuscle/python/libmuscle/test/test_snapshot_manager.py
@@ -0,0 +1,133 @@
+import pytest
+from ymmsl import CheckpointRange, CheckpointRules
+
+from libmuscle.snapshot_manager import (
+    CombinedCheckpointTriggers, AtCheckpointTrigger, RangeCheckpointTrigger)
+
+
+def test_at_checkpoint_trigger():
+    trigger = AtCheckpointTrigger([1, 3, 4, 4.5, 9])
+
+    assert trigger.next_checkpoint(0) == 1
+    assert trigger.previous_checkpoint(0) is None
+
+    assert trigger.next_checkpoint(1) == 3
+    assert trigger.previous_checkpoint(1) == 1
+
+    eps = 1e-16
+    assert trigger.next_checkpoint(1 - eps) == 1
+    assert trigger.previous_checkpoint(1 - eps) is None
+
+    assert trigger.next_checkpoint(3.9) == 4
+    assert trigger.previous_checkpoint(3.9) == 3
+
+    assert trigger.next_checkpoint(4.1) == 4.5
+    assert trigger.previous_checkpoint(4.1) == 4
+
+    assert trigger.next_checkpoint(5) == 9
+    assert trigger.previous_checkpoint(5) == 4.5
+
+    assert trigger.next_checkpoint(9) is None
+    assert trigger.previous_checkpoint(9) == 9
+
+    assert trigger.next_checkpoint(11) is None
+    assert trigger.previous_checkpoint(11) == 9
+
+
+def test_range_checkpoint_trigger():
+    range = CheckpointRange(start=0, stop=20, step=1.2)
+    trigger = RangeCheckpointTrigger(range)
+
+    assert trigger.next_checkpoint(-1) == 0
+    assert trigger.previous_checkpoint(-1) is None
+
+    assert trigger.next_checkpoint(0) == pytest.approx(1.2)
+    assert trigger.previous_checkpoint(0) == 0
+
+    assert trigger.next_checkpoint(8) == pytest.approx(8.4)
+    assert trigger.previous_checkpoint(8) == pytest.approx(7.2)
+
+    assert trigger.next_checkpoint(18.2) == pytest.approx(19.2)
+    assert trigger.previous_checkpoint(18.2) == pytest.approx(18)
+
+    assert trigger.next_checkpoint(20) is None
+    assert trigger.previous_checkpoint(20) == pytest.approx(19.2)
+
+
+def test_range_checkpoint_trigger_default_stop():
+    range = CheckpointRange(start=1, step=1.2)
+    trigger = RangeCheckpointTrigger(range)
+
+    assert trigger.next_checkpoint(-1.) == 1
+    assert trigger.previous_checkpoint(-1.) is None
+
+    assert trigger.next_checkpoint(148148.) == pytest.approx(148148.2)
+    assert trigger.previous_checkpoint(148148.) == pytest.approx(148147)
+
+    assert trigger.next_checkpoint(148148148.) == pytest.approx(148148149)
+    assert trigger.previous_checkpoint(148148148.) == pytest.approx(148148147.8)
+
+
+def test_range_checkpoint_trigger_default_start():
+    range = CheckpointRange(step=1.2, stop=10)
+    trigger = RangeCheckpointTrigger(range)
+
+    assert trigger.next_checkpoint(10) is None
+    assert trigger.previous_checkpoint(10) == pytest.approx(9.6)
+
+    assert trigger.next_checkpoint(0.0) == pytest.approx(1.2)
+    assert trigger.previous_checkpoint(0.0) == pytest.approx(0.0)
+
+    assert trigger.next_checkpoint(-148148.) == pytest.approx(-148147.2)
+    assert trigger.previous_checkpoint(-148148.) == pytest.approx(-148148.4)
+
+
+def test_combined_checkpoint_trigger_every_at():
+    rules = CheckpointRules(every=10, at=[3, 7, 13, 17])
+    trigger = CombinedCheckpointTriggers(rules)
+
+    assert trigger.next_checkpoint(-11.) == pytest.approx(-10)
+    assert trigger.previous_checkpoint(-11) == pytest.approx(-20)
+
+    assert trigger.next_checkpoint(0.) == pytest.approx(3)
+    assert trigger.previous_checkpoint(0.) == pytest.approx(0)
+
+    assert trigger.next_checkpoint(8.3) == pytest.approx(10)
+    assert trigger.previous_checkpoint(8.3) == pytest.approx(7)
+
+    assert trigger.next_checkpoint(14.2) == pytest.approx(17)
+    assert trigger.previous_checkpoint(14.2) == pytest.approx(13)
+
+    assert trigger.next_checkpoint(25.2) == pytest.approx(30)
+    assert trigger.previous_checkpoint(25.2) == pytest.approx(20)
+
+
+def test_combined_checkpoint_trigger_at_ranges():
+    rules = CheckpointRules(at=[3, 7, 13, 17], ranges=[
+                    CheckpointRange(start=0, step=5, stop=20),
+                    CheckpointRange(start=20, step=20, stop=100)])
+    trigger = CombinedCheckpointTriggers(rules)
+
+    assert trigger.next_checkpoint(-11.) == pytest.approx(0)
+    assert trigger.previous_checkpoint(-11) is None
+
+    assert trigger.next_checkpoint(0.) == pytest.approx(3)
+    assert trigger.previous_checkpoint(0.) == pytest.approx(0)
+
+    assert trigger.next_checkpoint(8.3) == pytest.approx(10)
+    assert trigger.previous_checkpoint(8.3) == pytest.approx(7)
+
+    assert trigger.next_checkpoint(14.2) == pytest.approx(15)
+    assert trigger.previous_checkpoint(14.2) == pytest.approx(13)
+
+    assert trigger.next_checkpoint(19.3) == pytest.approx(20)
+    assert trigger.previous_checkpoint(19.3) == pytest.approx(17)
+
+    assert trigger.next_checkpoint(25.2) == pytest.approx(40)
+    assert trigger.previous_checkpoint(25.2) == pytest.approx(20)
+
+    assert trigger.next_checkpoint(95.2) == pytest.approx(100)
+    assert trigger.previous_checkpoint(95.2) == pytest.approx(80)
+
+    assert trigger.next_checkpoint(125.2) is None
+    assert trigger.previous_checkpoint(125.2) == pytest.approx(100)

From 27bf3c21cbc253b72675132e8e91e4bf19510f21 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Fri, 26 Aug 2022 11:38:18 +0200
Subject: [PATCH 011/183] [tox] add ymmsl feature branch as dependency

---
 tox.ini | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tox.ini b/tox.ini
index 0e2a1348..23fb19f3 100644
--- a/tox.ini
+++ b/tox.ini
@@ -8,6 +8,7 @@ deps =
     flake8
     pytest
     pytest-cov
+    git+https://github.com/multiscale/ymmsl-python.git@feature/checkpointing#egg=ymmsl
 
 passenv =
     MUSCLE_TEST_PYTHON_ONLY

From a2e6a97909b7356dabc51c34fbd5155e9d0088f4 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Fri, 26 Aug 2022 16:35:05 +0200
Subject: [PATCH 012/183] Implement message counters on Port

---
 libmuscle/python/libmuscle/port.py | 70 +++++++++++++++++++++++++++++-
 1 file changed, 68 insertions(+), 2 deletions(-)

diff --git a/libmuscle/python/libmuscle/port.py b/libmuscle/python/libmuscle/port.py
index beb49249..9aced2d5 100644
--- a/libmuscle/python/libmuscle/port.py
+++ b/libmuscle/python/libmuscle/port.py
@@ -1,8 +1,20 @@
-from typing import List, Optional
+from typing import List, Optional, TypeVar
+
 from ymmsl import Identifier, Operator
 import ymmsl
 
 
+_T = TypeVar("_T")
+
+
+def _extend_list_to_size(lst: List[_T], size: int, padding: _T) -> None:
+    """When lst is smaller than size, extend to size using padding as values
+    """
+    num_extend = size - len(lst)
+    if num_extend > 0:
+        lst += [padding] * num_extend
+
+
 class Port(ymmsl.Port):
     """Represents a gateway to the outside world.
 
@@ -10,12 +22,18 @@ class Port(ymmsl.Port):
     an operator, as well as a set of dimensions that determines the
     valid slot indices for sending or receiving on this port.
 
+    Ports keep track of the amount of messages sent or received on the port.
+    However, the actual incrementing and validation is done in
+    :class:`Communicator`.
+
     Attributes:
         name (Identifier): Name of this port.
         operator (Operator): Operator associated with this port.
     """
+
     def __init__(self, name: str, operator: Operator, is_vector: bool,
-                 is_connected: bool, our_ndims: int, peer_dims: List[int]
+                 is_connected: bool, our_ndims: int, peer_dims: List[int],
+                 num_messages: Optional[List[int]] = None
                  ) -> None:
         """Create a Port.
 
@@ -68,6 +86,13 @@ def __init__(self, name: str, operator: Operator, is_vector: bool,
             self._is_open = [True]
 
         self._is_resizable = is_vector and (our_ndims == len(peer_dims))
+        self._num_messages = []  # type: List[int]
+        self._is_resuming = []  # type: List[bool]
+        if num_messages is not None:
+            self._num_messages = num_messages
+            self._is_resuming = [True] * len(num_messages)
+        _extend_list_to_size(self._num_messages, self._length or 1, 0)
+        _extend_list_to_size(self._is_resuming, self._length or 1, False)
 
     # Note: I'm not sure how this will develop exactly, so this class has some
     # accessors even if those are un-Pythonic; in the future a simple variable
@@ -129,6 +154,11 @@ def set_length(self, length: int) -> None:
         if length != self._length:
             self._length = length
             self._is_open = [True] * self._length
+            # Using extend here to not discard any information about message
+            # numbers between resizes. Note that _num_messages and _is_resuming
+            # may be longer than self._length!
+            _extend_list_to_size(self._num_messages, self._length, 0)
+            _extend_list_to_size(self._is_resuming, self._length, False)
 
     def set_closed(self, slot: Optional[int] = None) -> None:
         """Marks this port as closed.
@@ -137,3 +167,39 @@ def set_closed(self, slot: Optional[int] = None) -> None:
             self._is_open[slot] = False
         else:
             self._is_open = [False]
+
+    def increment_num_messages(self, slot: Optional[int] = None) -> None:
+        """Increment amount of messages sent or received.
+
+        Args:
+            slot: The slot that is sent/received on
+        """
+        self._num_messages[slot or 0] += 1
+        self.set_resumed(slot)
+
+    def get_num_messages(self, slot: Optional[int] = None) -> int:
+        """Get the amount of messages sent or received.
+
+        Args:
+            slot: The slot that is sent/received on
+        """
+        return self._num_messages[slot or 0]
+
+    def is_resuming(self, slot: Optional[int] = None) -> bool:
+        """True when this port has resumed.
+
+        After resumption, each port/slot may discard exactly one message.
+        is_resuming keeps track of this state.
+
+        Args:
+            slot: The slot that is sent/received on
+        """
+        return self._is_resuming[slot or 0]
+
+    def set_resumed(self, slot: Optional[int] = None) -> None:
+        """Mark that this port has resumed and may no longer discard messages.
+
+        Args:
+            slot: The slot that is sent/received on
+        """
+        self._is_resuming[slot or 0] = False

From 38168a324c5a3bb1d87d9a1519701d70c134f195 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Fri, 26 Aug 2022 16:48:06 +0200
Subject: [PATCH 013/183] Add message numbers to MMPMessage

---
 integration_test/test_cpp_mpp_client.py       |  2 +-
 libmuscle/python/libmuscle/communicator.py    | 48 +++++++++++++------
 libmuscle/python/libmuscle/mpp_message.py     |  7 ++-
 .../libmuscle/test/test_communicator.py       | 24 ++++++----
 .../python/libmuscle/test/test_mpp_message.py | 10 ++--
 .../python/libmuscle/test/test_outbox.py      |  1 +
 6 files changed, 62 insertions(+), 30 deletions(-)

diff --git a/integration_test/test_cpp_mpp_client.py b/integration_test/test_cpp_mpp_client.py
index a08e7bc4..7541993e 100644
--- a/integration_test/test_cpp_mpp_client.py
+++ b/integration_test/test_cpp_mpp_client.py
@@ -23,7 +23,7 @@ def tcp_server_process(control_pipe):
     message = MPPMessage(
             Reference('test_sender.test_port'),
             receiver,
-            10, 1.0, 2.0, settings, data).encoded()
+            10, 1.0, 2.0, settings, 0, data).encoded()
 
     def handle_request(request_bytes):
         request = msgpack.unpackb(request_bytes, raw=False)
diff --git a/libmuscle/python/libmuscle/communicator.py b/libmuscle/python/libmuscle/communicator.py
index efdcb30d..af7e14bd 100644
--- a/libmuscle/python/libmuscle/communicator.py
+++ b/libmuscle/python/libmuscle/communicator.py
@@ -79,6 +79,7 @@ def __init__(self, kernel: Reference, index: List[int],
             profiler: The profiler to use for recording sends and
                     receives.
         """
+        # TODO: pass a SnapshotManager and store as self._snapshot_manager
         self._kernel = kernel
         self._index = index
         self._declared_ports = declared_ports
@@ -213,14 +214,16 @@ def send_message(
                 snd_endpoint.port, slot_list)
 
         port_length = None
-        if self._ports[port_name].is_resizable():
-            port_length = self._ports[port_name].get_length()
+        if port.is_resizable():
+            port_length = port.get_length()
 
         mcp_message = MPPMessage(snd_endpoint.ref(), recv_endpoint.ref(),
                                  port_length,
                                  message.timestamp, message.next_timestamp,
                                  cast(Settings, message.settings),
+                                 port.get_num_messages(slot),
                                  message.data)
+        port.increment_num_messages(slot)
         encoded_message = mcp_message.encoded()
         self._post_office.deposit(recv_endpoint.ref(), encoded_message)
         profile_event.stop()
@@ -257,12 +260,12 @@ def receive_message(self, port_name: str, slot: Optional[int] = None,
                 connected.
         """
         if slot is None:
-            _logger.debug('Waiting for message on {}'.format(port_name))
+            port_and_slot = port_name
             slot_list = []      # type: List[int]
         else:
-            _logger.debug('Waiting for message on {}[{}]'.format(
-                port_name, slot))
+            port_and_slot = f"{port_name}[{slot}]"
             slot_list = [slot]
+        _logger.debug('Waiting for message on {}'.format(port_and_slot))
 
         recv_endpoint = self.__get_endpoint(port_name, slot_list)
 
@@ -311,15 +314,26 @@ def receive_message(self, port_name: str, slot: Optional[int] = None,
             profile_event.port_length = port.get_length()
         profile_event.message_size = len(mcp_message_bytes)
 
-        if slot is None:
-            _logger.debug('Received message on {}'.format(port_name))
-            if isinstance(mcp_message.data, ClosePort):
-                _logger.debug('Port {} is now closed'.format(port_name))
-        else:
-            _logger.debug('Received message on {}[{}]'.format(port_name, slot))
-            if isinstance(mcp_message.data, ClosePort):
-                _logger.debug('Port {}[{}] is now closed'.format(
-                    port_name, slot))
+        expected_message_number = port.get_num_messages(slot)
+        # TODO: handle f_init port counts for STATELESS and WEAKLY_STATEFUL
+        # components which didn't load a snapshot
+        if expected_message_number != mcp_message.message_number:
+            if (expected_message_number - 1 == mcp_message.message_number and
+                    port.is_resuming(slot)):
+                _logger.debug(f'Discarding received message on {port_and_slot}'
+                              ': resuming from weakly consistent snapshot')
+                port.set_resumed()
+                return self.receive_message(port_name, slot, default)
+            raise RuntimeError(f'Received message on {port_and_slot} with'
+                               ' unexpected message number'
+                               f' {mcp_message.message_number}. Was expecting'
+                               f' {expected_message_number}. Are you resuming'
+                               ' from an inconsistent snapshot?')
+        port.increment_num_messages(slot)
+
+        _logger.debug('Received message on {}'.format(port_and_slot))
+        if isinstance(mcp_message.data, ClosePort):
+            _logger.debug('Port {} is now closed'.format(port_and_slot))
 
         return message
 
@@ -380,6 +394,8 @@ def __ports_from_declared(self) -> Dict[str, Port]:
                 ports[port_name] = Port(
                         port_name, operator, is_vector, is_connected,
                         len(self._index), port_peer_dims)
+                # TODO: retrieve num_messages[] for this port from
+                # self._snapshot_manager when resuming
         return ports
 
     def __ports_from_conduits(self, conduits: List[Conduit]
@@ -411,6 +427,8 @@ def __ports_from_conduits(self, conduits: List[Conduit]
                 ports[str(port_id)] = Port(
                         str(port_id), operator, is_vector, is_connected,
                         len(self._index), port_peer_dims)
+                # TODO: retrieve num_messages[] for this port from
+                # self._snapshot_manager when resuming
         return ports
 
     def __settings_in_port(self, conduits: List[Conduit]) -> Port:
@@ -430,6 +448,8 @@ def __settings_in_port(self, conduits: List[Conduit]) -> Port:
                                     conduit.sending_component()))
         return Port('muscle_settings_in', Operator.F_INIT, False, False,
                     len(self._index), [])
+        # TODO: retrieve num_messages[] for this port from
+        # self._snapshot_manager when resuming
 
     def __get_client(self, instance: Reference) -> MPPClient:
         """Get or create a client to connect to the given instance.
diff --git a/libmuscle/python/libmuscle/mpp_message.py b/libmuscle/python/libmuscle/mpp_message.py
index 15ff09f9..69ea4563 100644
--- a/libmuscle/python/libmuscle/mpp_message.py
+++ b/libmuscle/python/libmuscle/mpp_message.py
@@ -151,7 +151,7 @@ class MPPMessage:
     def __init__(self, sender: Reference, receiver: Reference,
                  port_length: Optional[int],
                  timestamp: float, next_timestamp: Optional[float],
-                 settings_overlay: Settings, data: Any
+                 settings_overlay: Settings, message_number: int, data: Any
                  ) -> None:
         """Create an MPPMessage.
 
@@ -177,6 +177,7 @@ def __init__(self, sender: Reference, receiver: Reference,
         self.timestamp = timestamp
         self.next_timestamp = next_timestamp
         self.settings_overlay = settings_overlay
+        self.message_number = message_number
         if isinstance(data, np.ndarray):
             self.data = Grid(data)
         else:
@@ -197,11 +198,12 @@ def from_bytes(message: bytes) -> 'MPPMessage':
         timestamp = message_dict["timestamp"]
         next_timestamp = message_dict["next_timestamp"]
         settings_overlay = message_dict["settings_overlay"]
+        message_number = message_dict["message_number"]
 
         data = message_dict["data"]
         return MPPMessage(
                 sender, receiver, port_length, timestamp, next_timestamp,
-                settings_overlay, data)
+                settings_overlay, message_number, data)
 
     def encoded(self) -> bytes:
         """Encode the message and return as a bytes buffer.
@@ -213,6 +215,7 @@ def encoded(self) -> bytes:
                 'timestamp': self.timestamp,
                 'next_timestamp': self.next_timestamp,
                 'settings_overlay': self.settings_overlay,
+                'message_number': self.message_number,
                 'data': self.data
                 }
 
diff --git a/libmuscle/python/libmuscle/test/test_communicator.py b/libmuscle/python/libmuscle/test/test_communicator.py
index 8f0f1238..a4f3a751 100644
--- a/libmuscle/python/libmuscle/test/test_communicator.py
+++ b/libmuscle/python/libmuscle/test/test_communicator.py
@@ -281,6 +281,7 @@ def test_send_message(communicator, message) -> None:
     assert msg.timestamp == 0.0
     assert msg.next_timestamp is None
     assert msg.settings_overlay == Settings()
+    assert msg.message_number == 0
     assert msg.data == b'test'
 
 
@@ -304,6 +305,7 @@ def test_send_msgpack(communicator, message2) -> None:
     assert msg.sender == 'kernel[13].out'
     assert msg.receiver == 'other.in[13]'
     assert msg.settings_overlay == Settings()
+    assert msg.message_number == 0
     assert msg.data == {'test': 17}
 
 
@@ -318,6 +320,7 @@ def test_send_message_with_slot(communicator2, message) -> None:
     assert msg.sender == 'other.out[13]'
     assert msg.receiver == 'kernel[13].in'
     assert msg.settings_overlay == Settings()
+    assert msg.message_number == 0
     assert msg.data == b'test'
 
 
@@ -348,6 +351,7 @@ def test_send_message_with_settings(communicator, message) -> None:
     assert msg.sender == 'kernel[13].out'
     assert msg.receiver == 'other.in[13]'
     assert msg.settings_overlay.as_ordered_dict() == {'test2': 'testing'}
+    assert msg.message_number == 0
     assert msg.data == b'test'
 
 
@@ -363,6 +367,7 @@ def test_send_settings(communicator, message) -> None:
     assert msg.sender == 'kernel[13].out'
     assert msg.receiver == 'other.in[13]'
     assert msg.settings_overlay == Settings()
+    assert msg.message_number == 0
     assert msg.data == Settings({'test1': 'testing'})
 
 
@@ -378,6 +383,7 @@ def test_close_port(communicator) -> None:
     assert msg.timestamp == float('inf')
     assert msg.next_timestamp is None
     assert msg.settings_overlay == Settings()
+    assert msg.message_number == 0
     assert isinstance(msg.data, ClosePort)
 
 
@@ -385,7 +391,7 @@ def test_receive_message(communicator) -> None:
     client_mock = MagicMock()
     client_mock.receive.return_value = MPPMessage(
             Reference('other.out[13]'), Reference('kernel[13].in'),
-            None, 0.0, None, Settings({'test1': 12}),
+            None, 0.0, None, Settings({'test1': 12}), 0,
             b'test').encoded()
     get_client_mock = MagicMock(return_value=client_mock)
     communicator._Communicator__get_client = get_client_mock
@@ -424,7 +430,7 @@ def test_receive_msgpack(communicator) -> None:
     client_mock = MagicMock()
     client_mock.receive.return_value = MPPMessage(
             Reference('other.out[13]'), Reference('kernel[13].in'),
-            None, 0.0, None, Settings({'test1': 12}),
+            None, 0.0, None, Settings({'test1': 12}), 0,
             {'test': 13}).encoded()
     get_client_mock = MagicMock(return_value=client_mock)
     communicator._Communicator__get_client = get_client_mock
@@ -441,7 +447,7 @@ def test_receive_with_slot(communicator2) -> None:
     client_mock = MagicMock()
     client_mock.receive.return_value = MPPMessage(
             Reference('kernel[13].out'), Reference('other.in[13]'),
-            None, 0.0, None, Settings({'test': 'testing'}),
+            None, 0.0, None, Settings({'test': 'testing'}), 0,
             b'test').encoded()
     get_client_mock = MagicMock(return_value=client_mock)
     communicator2._Communicator__get_client = get_client_mock
@@ -459,7 +465,7 @@ def test_receive_message_resizable(communicator3) -> None:
     client_mock = MagicMock()
     client_mock.receive.return_value = MPPMessage(
             Reference('other.out[13]'), Reference('kernel.in[13]'),
-            20, 0.0, None, Settings({'test': 'testing'}),
+            20, 0.0, None, Settings({'test': 'testing'}), 0,
             b'test').encoded()
     get_client_mock = MagicMock(return_value=client_mock)
     communicator3._Communicator__get_client = get_client_mock
@@ -477,7 +483,7 @@ def test_receive_with_settings(communicator) -> None:
     client_mock = MagicMock()
     client_mock.receive.return_value = MPPMessage(
             Reference('other.out[13]'), Reference('kernel[13].in'),
-            None, 0.0, None, Settings({'test2': 3.1}),
+            None, 0.0, None, Settings({'test2': 3.1}), 0,
             b'test').encoded()
     get_client_mock = MagicMock(return_value=client_mock)
     communicator._Communicator__get_client = get_client_mock
@@ -496,7 +502,7 @@ def test_receive_msgpack_with_slot_and_settings(communicator2) -> None:
     client_mock.receive.return_value = MPPMessage(
             Reference('kernel[13].out'), Reference('other.in[13]'),
             None, 0.0, 1.0,
-            Settings({'test': 'testing'}), 'test').encoded()
+            Settings({'test': 'testing'}), 0, 'test').encoded()
     get_client_mock = MagicMock(return_value=client_mock)
     communicator2._Communicator__get_client = get_client_mock
     communicator2._profiler = MagicMock()
@@ -513,7 +519,7 @@ def test_receive_settings(communicator) -> None:
     client_mock = MagicMock()
     client_mock.receive.return_value = MPPMessage(
             Reference('other.out[13]'), Reference('kernel[13].in'),
-            None, 0.0, None, Settings({'test1': 12}),
+            None, 0.0, None, Settings({'test1': 12}), 0,
             Settings({'test': 13})).encoded()
     get_client_mock = MagicMock(return_value=client_mock)
     communicator._Communicator__get_client = get_client_mock
@@ -531,7 +537,7 @@ def test_receive_close_port(communicator) -> None:
     client_mock = MagicMock()
     client_mock.receive.return_value = MPPMessage(
             Reference('other.out[13]'), Reference('kernel[13].in'),
-            None, 0.0, None, Settings(), ClosePort()).encoded()
+            None, 0.0, None, Settings(), 0, ClosePort()).encoded()
     get_client_mock = MagicMock(return_value=client_mock)
     communicator._Communicator__get_client = get_client_mock
     communicator._profiler = MagicMock()
@@ -545,6 +551,6 @@ def test_get_message(communicator, message) -> None:
     communicator.send_message('out', message)
     ref_message = MPPMessage(
             Reference('kernel[13].out'), Reference('other.in[13]'),
-            None, 0.0, None, Settings(), b'test').encoded()
+            None, 0.0, None, Settings(), 0, b'test').encoded()
     assert communicator._post_office.get_message(
             'other.in[13]') == ref_message
diff --git a/libmuscle/python/libmuscle/test/test_mpp_message.py b/libmuscle/python/libmuscle/test/test_mpp_message.py
index 79ee8ee6..dce3ed88 100644
--- a/libmuscle/python/libmuscle/test/test_mpp_message.py
+++ b/libmuscle/python/libmuscle/test/test_mpp_message.py
@@ -18,13 +18,14 @@ def test_create() -> None:
     data = (12345).to_bytes(2, 'little', signed=True)
     msg = MPPMessage(
             sender, receiver, None, timestamp, next_timestamp,
-            settings_overlay, data)
+            settings_overlay, 0, data)
     assert msg.sender == sender
     assert msg.receiver == receiver
     assert msg.port_length is None
     assert msg.timestamp == 10.0
     assert msg.next_timestamp == 11.0
     assert msg.settings_overlay == settings_overlay
+    assert msg.message_number == 0
     assert msg.data == data
 
 
@@ -43,7 +44,7 @@ def test_grid_encode() -> None:
     grid = Grid(array, ['x', 'y', 'z'])
     msg = MPPMessage(
             sender, receiver, None, timestamp, next_timestamp, Settings(),
-            grid)
+            0, grid)
 
     wire_data = msg.encoded()
     mcp_decoded = msgpack.unpackb(wire_data, raw=False)
@@ -86,6 +87,7 @@ def test_grid_decode() -> None:
             'timestamp': 0.0,
             'next_timestamp': None,
             'settings_overlay': msgpack.ExtType(1, settings_data),
+            'message_number': 0,
             'data': msgpack.ExtType(2, grid_data)}
 
     wire_data = msgpack.packb(msg_dict, use_bin_type=True)
@@ -135,7 +137,7 @@ def test_grid_roundtrip() -> None:
         grid = Grid(array, ['x', 'y', 'z'])
         msg = MPPMessage(
                 sender, receiver, None, timestamp, next_timestamp, Settings(),
-                grid)
+                0, grid)
 
         wire_data = msg.encoded()
         msg_out = MPPMessage.from_bytes(wire_data)
@@ -169,7 +171,7 @@ def test_non_contiguous_grid_roundtrip() -> None:
     grid = Grid(array.real, ['a', 'b', 'c'])
     msg = MPPMessage(
             sender, receiver, None, timestamp, next_timestamp, Settings(),
-            grid)
+            0, grid)
 
     wire_data = msg.encoded()
     msg_out = MPPMessage.from_bytes(wire_data)
diff --git a/libmuscle/python/libmuscle/test/test_outbox.py b/libmuscle/python/libmuscle/test/test_outbox.py
index 6b22f068..cb4af31a 100644
--- a/libmuscle/python/libmuscle/test/test_outbox.py
+++ b/libmuscle/python/libmuscle/test/test_outbox.py
@@ -19,6 +19,7 @@ def message():
             Ref('sender.out'), Ref('receiver.in'),
             None, 0.0, 1.0,
             bytes(),
+            0,
             'testing'.encode('utf-8'))
 
 

From 5764f9e4155c9831f08e54420fa7e77c132fc234 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Mon, 29 Aug 2022 09:49:40 +0200
Subject: [PATCH 014/183] Implement message counters on Port (C++)

---
 libmuscle/cpp/src/libmuscle/port.cpp | 61 ++++++++++++++++++++++++++-
 libmuscle/cpp/src/libmuscle/port.hpp | 63 +++++++++++++++++++++++++++-
 2 files changed, 122 insertions(+), 2 deletions(-)

diff --git a/libmuscle/cpp/src/libmuscle/port.cpp b/libmuscle/cpp/src/libmuscle/port.cpp
index 691c75cf..2cb119ce 100644
--- a/libmuscle/cpp/src/libmuscle/port.cpp
+++ b/libmuscle/cpp/src/libmuscle/port.cpp
@@ -11,12 +11,26 @@ using ymmsl::Identifier;
 using ymmsl::Operator;
 
 
+namespace {
+
+template< typename T>
+inline void extend_vector_to_size(
+        std::vector<T> &vec, const int minsize, const T &val) {
+    if(static_cast<int>(vec.size()) < minsize) {
+        vec.resize(minsize, val);
+    }
+}
+
+}
+
+
 namespace libmuscle { namespace impl {
 
 Port::Port(
         std::string const & name, Operator oper,
         bool is_vector, bool is_connected,
-        int our_ndims, std::vector<int> peer_dims)
+        int our_ndims, std::vector<int> peer_dims,
+        std::vector<int> num_messages)
     : ::ymmsl::Port(Identifier(name), oper)
 {
     is_connected_ = is_connected;
@@ -53,6 +67,12 @@ Port::Port(
     }
 
     is_resizable_ = is_vector && (our_ndims == static_cast<int>(peer_dims.size()));
+    if (!num_messages.empty()) {
+        num_messages_ = num_messages;
+        is_resuming_.resize(num_messages_.size(), true);
+    }
+    extend_vector_to_size(num_messages_, std::min(1, length_), 0);
+    extend_vector_to_size(is_resuming_, std::min(1, length_), false);
 }
 
 bool Port::is_connected() const {
@@ -94,6 +114,11 @@ void Port::set_length(int length) {
     if (length != length_) {
         length_ = length;
         is_open_ = std::vector<bool>(length_, true);
+        // Using extend here to not discard any information about message
+        // numbers between resizes. Note that _num_messages and _is_resuming
+        // may be longer than self._length!
+        extend_vector_to_size(num_messages_, std::min(1, length_), 0);
+        extend_vector_to_size(is_resuming_, std::min(1, length_), false);
     }
 }
 
@@ -105,5 +130,39 @@ void Port::set_closed(int slot) {
     is_open_[slot] = false;
 }
 
+void Port::increment_num_messages() {
+    num_messages_[0] ++;
+    set_resumed();
+}
+
+void Port::increment_num_messages(int slot) {
+    num_messages_[slot] ++;
+    set_resumed(slot);
+}
+
+int Port::get_num_messages() const {
+    return num_messages_[0];
+}
+
+int Port::get_num_messages(int slot) const {
+    return num_messages_[slot];
+}
+
+bool Port::is_resuming() const {
+    return is_resuming_[0];
+}
+
+bool Port::is_resuming(int slot) const {
+    return is_resuming_[slot];
+}
+
+void Port::set_resumed() {
+    is_resuming_[0] = false;
+}
+
+void Port::set_resumed(int slot) {
+    is_resuming_[slot] = false;
+}
+
 } }
 
diff --git a/libmuscle/cpp/src/libmuscle/port.hpp b/libmuscle/cpp/src/libmuscle/port.hpp
index 41bf8898..e0b6c61c 100644
--- a/libmuscle/cpp/src/libmuscle/port.hpp
+++ b/libmuscle/cpp/src/libmuscle/port.hpp
@@ -11,6 +11,10 @@ namespace libmuscle { namespace impl {
  * Ports can be used to send or receive messages. They have a name and an
  * operator, as well as a set of dimensions that determines the valid slot
  * indices for sending or receiving on this port.
+ *
+ * Ports keep track of the amount of messages sent or received on the port.
+ * However, the actual incrementing and validation is done in
+ * Communicator.
  */
 class Port : public ::ymmsl::Port {
     public:
@@ -26,7 +30,8 @@ class Port : public ::ymmsl::Port {
         Port(
                 std::string const & name, ::ymmsl::Operator oper,
                 bool is_vector, bool is_connected,
-                int our_ndims, std::vector<int> peer_dims);
+                int our_ndims, std::vector<int> peer_dims,
+                std::vector<int> num_messages=std::vector<int>(0));
 
         // Note: we only ever use this Port in libmuscle, and only use
         // ymmsl::Port in ymmsl. Port objects are always handled by value, so
@@ -104,11 +109,67 @@ class Port : public ::ymmsl::Port {
          */
         void set_closed(int slot);
 
+        /** Increment amount of messages sent or received.
+         */
+        void increment_num_messages();
+
+        /** Increment amount of messages sent or received.
+         *
+         * Only valid for vector ports.
+         *
+         * @param slot The slot that is sent/received on
+         */
+        void increment_num_messages(int slot);
+
+        /** Get the amount of messages sent or received
+         */
+        int get_num_messages() const;
+
+        /** Get the amount of messages sent or received
+         *
+         * Only valid for vector ports.
+         *
+         * @param slot The slot that is sent/received on
+         */
+        int get_num_messages(int slot) const;
+
+        /** True when this port has resumed.
+         *
+         * After resumption, each port/slot may discard exactly one message.
+         * is_resuming keeps track of this state.
+         */
+        bool is_resuming() const;
+
+        /** True when this port has resumed.
+         *
+         * After resumption, each port/slot may discard exactly one message.
+         * is_resuming keeps track of this state.
+         *
+         * Only valid for vector ports.
+         *
+         * @param slot The slot that is sent/received on
+         */
+        bool is_resuming(int slot) const;
+
+        /** Mark that this port has resumed and may no longer discard messages.
+         */
+        void set_resumed();
+
+        /** Mark that this port has resumed and may no longer discard messages.
+         *
+         * Only valid for vector ports.
+         *
+         * @param slot The slot that is sent/received on
+         */
+        void set_resumed(int slot);
+
     private:
         bool is_connected_;
         int length_;
         bool is_resizable_;
         std::vector<bool> is_open_;
+        std::vector<int> num_messages_;
+        std::vector<bool> is_resuming_;
 };
 
 } }

From c702220acfbd7eb16f1838c0c7ff410ac4c28a65 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Mon, 29 Aug 2022 10:56:07 +0200
Subject: [PATCH 015/183] Fix bugs and add Optional call signatures in Port

---
 libmuscle/cpp/src/libmuscle/port.cpp | 36 ++++++++++++++++++++++++----
 libmuscle/cpp/src/libmuscle/port.hpp | 27 +++++++++++++++++++++
 2 files changed, 59 insertions(+), 4 deletions(-)

diff --git a/libmuscle/cpp/src/libmuscle/port.cpp b/libmuscle/cpp/src/libmuscle/port.cpp
index 2cb119ce..c4e7b4f9 100644
--- a/libmuscle/cpp/src/libmuscle/port.cpp
+++ b/libmuscle/cpp/src/libmuscle/port.cpp
@@ -71,8 +71,8 @@ Port::Port(
         num_messages_ = num_messages;
         is_resuming_.resize(num_messages_.size(), true);
     }
-    extend_vector_to_size(num_messages_, std::min(1, length_), 0);
-    extend_vector_to_size(is_resuming_, std::min(1, length_), false);
+    extend_vector_to_size(num_messages_, std::max(1, length_), 0);
+    extend_vector_to_size(is_resuming_, std::max(1, length_), false);
 }
 
 bool Port::is_connected() const {
@@ -117,8 +117,8 @@ void Port::set_length(int length) {
         // Using extend here to not discard any information about message
         // numbers between resizes. Note that _num_messages and _is_resuming
         // may be longer than self._length!
-        extend_vector_to_size(num_messages_, std::min(1, length_), 0);
-        extend_vector_to_size(is_resuming_, std::min(1, length_), false);
+        extend_vector_to_size(num_messages_, std::max(1, length_), 0);
+        extend_vector_to_size(is_resuming_, std::max(1, length_), false);
     }
 }
 
@@ -140,6 +140,13 @@ void Port::increment_num_messages(int slot) {
     set_resumed(slot);
 }
 
+void Port::increment_num_messages(Optional<int> slot) {
+    if(slot.is_set())
+        increment_num_messages(slot.get());
+    else
+        increment_num_messages();
+}
+
 int Port::get_num_messages() const {
     return num_messages_[0];
 }
@@ -148,6 +155,13 @@ int Port::get_num_messages(int slot) const {
     return num_messages_[slot];
 }
 
+int Port::get_num_messages(Optional<int> slot) const {
+    if(slot.is_set())
+        return get_num_messages(slot.get());
+    else
+        return get_num_messages();
+}
+
 bool Port::is_resuming() const {
     return is_resuming_[0];
 }
@@ -156,6 +170,13 @@ bool Port::is_resuming(int slot) const {
     return is_resuming_[slot];
 }
 
+bool Port::is_resuming(Optional<int> slot) const {
+    if(slot.is_set())
+        return is_resuming(slot.get());
+    else
+        return is_resuming();
+}
+
 void Port::set_resumed() {
     is_resuming_[0] = false;
 }
@@ -164,5 +185,12 @@ void Port::set_resumed(int slot) {
     is_resuming_[slot] = false;
 }
 
+void Port::set_resumed(Optional<int> slot) {
+    if(slot.is_set())
+        set_resumed(slot.get());
+    else
+        set_resumed();
+}
+
 } }
 
diff --git a/libmuscle/cpp/src/libmuscle/port.hpp b/libmuscle/cpp/src/libmuscle/port.hpp
index e0b6c61c..913a1917 100644
--- a/libmuscle/cpp/src/libmuscle/port.hpp
+++ b/libmuscle/cpp/src/libmuscle/port.hpp
@@ -121,6 +121,12 @@ class Port : public ::ymmsl::Port {
          */
         void increment_num_messages(int slot);
 
+        /** Increment amount of messages sent or received.
+         *
+         * @param slot The slot that is sent/received on
+         */
+        void increment_num_messages(Optional<int> slot);
+
         /** Get the amount of messages sent or received
          */
         int get_num_messages() const;
@@ -133,6 +139,12 @@ class Port : public ::ymmsl::Port {
          */
         int get_num_messages(int slot) const;
 
+        /** Get the amount of messages sent or received
+         *
+         * @param slot The slot that is sent/received on
+         */
+        int get_num_messages(Optional<int> slot) const;
+
         /** True when this port has resumed.
          *
          * After resumption, each port/slot may discard exactly one message.
@@ -151,6 +163,15 @@ class Port : public ::ymmsl::Port {
          */
         bool is_resuming(int slot) const;
 
+        /** True when this port has resumed.
+         *
+         * After resumption, each port/slot may discard exactly one message.
+         * is_resuming keeps track of this state.
+         *
+         * @param slot The slot that is sent/received on
+         */
+        bool is_resuming(Optional<int> slot) const;
+
         /** Mark that this port has resumed and may no longer discard messages.
          */
         void set_resumed();
@@ -163,6 +184,12 @@ class Port : public ::ymmsl::Port {
          */
         void set_resumed(int slot);
 
+        /** Mark that this port has resumed and may no longer discard messages.
+         *
+         * @param slot The slot that is sent/received on
+         */
+        void set_resumed(Optional<int> slot);
+
     private:
         bool is_connected_;
         int length_;

From bb0a1e12b5c31cd7c07289a8584f99631cfc5116 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Mon, 29 Aug 2022 10:57:54 +0200
Subject: [PATCH 016/183] Add message numbers to MMPMessage (C++)

---
 libmuscle/cpp/src/libmuscle/communicator.cpp  | 36 ++++++++++++++++---
 libmuscle/cpp/src/libmuscle/mpp_message.cpp   |  4 +++
 libmuscle/cpp/src/libmuscle/mpp_message.hpp   |  3 +-
 .../libmuscle/tests/mocks/mock_mpp_client.cpp |  2 +-
 .../tests/mocks/mock_post_office.cpp          |  4 +--
 .../tests/tcp_transport_server_test.cpp       |  1 +
 .../src/libmuscle/tests/test_mpp_message.cpp  |  8 +++--
 .../cpp/src/libmuscle/tests/test_outbox.cpp   |  1 +
 .../src/libmuscle/tests/test_post_office.cpp  |  2 +-
 .../tests/test_tcp_communication.cpp          |  3 +-
 10 files changed, 52 insertions(+), 12 deletions(-)

diff --git a/libmuscle/cpp/src/libmuscle/communicator.cpp b/libmuscle/cpp/src/libmuscle/communicator.cpp
index 4e9c5139..b74b4497 100644
--- a/libmuscle/cpp/src/libmuscle/communicator.cpp
+++ b/libmuscle/cpp/src/libmuscle/communicator.cpp
@@ -116,7 +116,7 @@ void Communicator::send_message(
         // log sending on disconnected port
         return;
 
-    // Port const & port = ports_.at(port_name);
+    Port & port = ports_.at(port_name);
 
     // TODO start profile event
 
@@ -126,13 +126,14 @@ void Communicator::send_message(
     Data settings_overlay(message.settings());
 
     Optional<int> port_length;
-    if (ports_.at(port_name).is_resizable())
-        port_length = ports_.at(port_name).get_length();
+    if (port.is_resizable())
+        port_length = port.get_length();
 
     MPPMessage mpp_message(
             snd_endpoint.ref(), recv_endpoint.ref(),
             port_length, message.timestamp(), Optional<double>(),
-            settings_overlay, message.data());
+            settings_overlay, port.get_num_messages(slot), message.data());
+    port.increment_num_messages(slot);
 
     if (message.has_next_timestamp())
         mpp_message.next_timestamp = message.next_timestamp();
@@ -204,6 +205,33 @@ Message Communicator::receive_message(
 
     // TODO stop and finalise profile event
 
+    int expected_message_number = port.get_num_messages(slot);
+    // TODO: handle f_init port counts for STATELESS and WEAKLY_STATEFUL
+    // components which didn't load a snapshot
+    if (expected_message_number != mpp_message.message_number) {
+        if (expected_message_number - 1 == mpp_message.message_number and
+                port.is_resuming(slot)) {
+            if (slot.is_set())
+                logger_.debug("Discarding received message on ", port_name,
+                              "[", slot.get(), "]: resuming from weakly",
+                              " constistent snapshot");
+            else
+                logger_.debug("Discarding received message on ", port_name,
+                              ": resuming from weakly constistent snapshot");
+            port.set_resumed(slot);
+            return receive_message(port_name, slot, default_msg);
+        }
+        std::ostringstream oss;
+        oss << "Received message on " << port_name;
+        if (slot.is_set())
+            oss << "[" << slot.get() << "]";
+        oss << " with unexpected message number " << mpp_message.message_number;
+        oss << ". Was expecting " << expected_message_number;
+        oss << ". Are you resuming from an inconsistent snapshot?";
+        throw std::runtime_error(oss.str());
+    }
+    port.increment_num_messages(slot);
+
     if (slot.is_set())
         logger_.debug("Received message on ", port_name, "[", slot.get(), "]");
     else
diff --git a/libmuscle/cpp/src/libmuscle/mpp_message.cpp b/libmuscle/cpp/src/libmuscle/mpp_message.cpp
index 2962e31c..bf1be0f0 100644
--- a/libmuscle/cpp/src/libmuscle/mpp_message.cpp
+++ b/libmuscle/cpp/src/libmuscle/mpp_message.cpp
@@ -16,6 +16,7 @@ MPPMessage::MPPMessage(
             ::libmuscle::impl::Optional<int> port_length,
             double timestamp, ::libmuscle::impl::Optional<double> next_timestamp,
             DataConstRef const & settings_overlay,
+            int message_number,
             DataConstRef const & data
             )
         : sender(sender)
@@ -24,6 +25,7 @@ MPPMessage::MPPMessage(
         , timestamp(timestamp)
         , next_timestamp(next_timestamp)
         , settings_overlay(settings_overlay)
+        , message_number(message_number)
         , data(data)
     {}
 
@@ -48,6 +50,7 @@ MPPMessage MPPMessage::from_bytes(DataConstRef const & data) {
             dict["timestamp"].as<double>(),
             next_timestamp,
             dict["settings_overlay"],
+            dict["message_number"].as<int>(),
             dict["data"]);
 }
 
@@ -67,6 +70,7 @@ DataConstRef MPPMessage::encoded() const {
             "timestamp", timestamp,
             "next_timestamp", next_timestamp_data,
             "settings_overlay", settings_overlay,
+            "message_number", message_number,
             "data", data
             );
 
diff --git a/libmuscle/cpp/src/libmuscle/mpp_message.hpp b/libmuscle/cpp/src/libmuscle/mpp_message.hpp
index 50e8a49b..96a26fe0 100644
--- a/libmuscle/cpp/src/libmuscle/mpp_message.hpp
+++ b/libmuscle/cpp/src/libmuscle/mpp_message.hpp
@@ -32,7 +32,7 @@ struct MPPMessage {
             ::ymmsl::Reference const & sender, ::ymmsl::Reference const & receiver,
             ::libmuscle::impl::Optional<int> port_length,
             double timestamp, ::libmuscle::impl::Optional<double> next_timestamp,
-            DataConstRef const & settings_overlay,
+            DataConstRef const & settings_overlay, int message_number,
             DataConstRef const & data);
 
     /** Create an MCP Message from an encoded buffer.
@@ -53,6 +53,7 @@ struct MPPMessage {
     double timestamp;
     ::libmuscle::impl::Optional<double> next_timestamp;
     DataConstRef settings_overlay;
+    int message_number;
     DataConstRef data;
 };
 
diff --git a/libmuscle/cpp/src/libmuscle/tests/mocks/mock_mpp_client.cpp b/libmuscle/cpp/src/libmuscle/tests/mocks/mock_mpp_client.cpp
index c23dbafc..55ae3a76 100644
--- a/libmuscle/cpp/src/libmuscle/tests/mocks/mock_mpp_client.cpp
+++ b/libmuscle/cpp/src/libmuscle/tests/mocks/mock_mpp_client.cpp
@@ -44,7 +44,7 @@ Settings MockMPPClient::make_overlay_() {
 }
 
 MPPMessage MockMPPClient::next_receive_message(
-        "test.out", "test2.in", 0, 0.0, 1.0, make_overlay_(), Data::dict("test1", 12));
+        "test.out", "test2.in", 0, 0.0, 1.0, make_overlay_(), 0, Data::dict("test1", 12));
 
 Reference MockMPPClient::last_receiver("_none");
 
diff --git a/libmuscle/cpp/src/libmuscle/tests/mocks/mock_post_office.cpp b/libmuscle/cpp/src/libmuscle/tests/mocks/mock_post_office.cpp
index e1d66eac..6d2bb3cc 100644
--- a/libmuscle/cpp/src/libmuscle/tests/mocks/mock_post_office.cpp
+++ b/libmuscle/cpp/src/libmuscle/tests/mocks/mock_post_office.cpp
@@ -6,13 +6,13 @@ int MockPostOffice::handle_request(
         char const * res_buf, std::size_t res_len,
         std::unique_ptr<DataConstRef> & response) {
     response = std::make_unique<DataConstRef>(
-            MPPMessage("test.out", "test2.in", 0, 0.0, 1.0, Data(), Data()).encoded());
+            MPPMessage("test.out", "test2.in", 0, 0.0, 1.0, Data(), 0, Data()).encoded());
     return -1;
 }
 
 std::unique_ptr<DataConstRef> MockPostOffice::get_response(int fd) {
     return std::make_unique<DataConstRef>(
-            MPPMessage("test.out", "test2.in", 0, 0.0, 1.0, Data(), Data()).encoded());
+            MPPMessage("test.out", "test2.in", 0, 0.0, 1.0, Data(), 0, Data()).encoded());
 }
 
 void MockPostOffice::deposit(
diff --git a/libmuscle/cpp/src/libmuscle/tests/tcp_transport_server_test.cpp b/libmuscle/cpp/src/libmuscle/tests/tcp_transport_server_test.cpp
index 9e069031..248f597f 100644
--- a/libmuscle/cpp/src/libmuscle/tests/tcp_transport_server_test.cpp
+++ b/libmuscle/cpp/src/libmuscle/tests/tcp_transport_server_test.cpp
@@ -38,6 +38,7 @@ int main(int argc, char *argv[]) {
             "test_sender.port", receiver, 10,
             0.0, 1.0,
             overlay_settings,
+            0,
             data_dict);
     auto msg_data = std::make_unique<DataConstRef>(msg.encoded());
     post_office.deposit(receiver, std::move(msg_data));
diff --git a/libmuscle/cpp/src/libmuscle/tests/test_mpp_message.cpp b/libmuscle/cpp/src/libmuscle/tests/test_mpp_message.cpp
index 88ba96c9..53f2ed28 100644
--- a/libmuscle/cpp/src/libmuscle/tests/test_mpp_message.cpp
+++ b/libmuscle/cpp/src/libmuscle/tests/test_mpp_message.cpp
@@ -29,7 +29,7 @@ TEST(test_mcp_message, create_mcp_message) {
             Reference("sender.port"), Reference("receiver.port"),
             10,
             100.1, 101.0,
-            test, abc
+            test, 0, abc
             );
 
     ASSERT_EQ(m.sender, "sender.port");
@@ -38,6 +38,7 @@ TEST(test_mcp_message, create_mcp_message) {
     ASSERT_EQ(m.timestamp, 100.1);
     ASSERT_EQ(m.next_timestamp, 101.0);
     ASSERT_EQ(m.settings_overlay.as<std::string>(), "test");
+    ASSERT_EQ(m.message_number, 0);
     ASSERT_EQ(m.data.as<std::string>(), "abc");
 }
 
@@ -48,7 +49,7 @@ TEST(test_mcp_message, create_mcp_message_minimal) {
             Reference("sender.port"), Reference("receiver.port"),
             {},
             100.1, {},
-            test, abc
+            test, 0, abc
             );
 
     ASSERT_EQ(m.sender, "sender.port");
@@ -57,6 +58,7 @@ TEST(test_mcp_message, create_mcp_message_minimal) {
     ASSERT_EQ(m.timestamp, 100.1);
     ASSERT_FALSE(m.next_timestamp.is_set());
     ASSERT_TRUE(m.settings_overlay.is_nil());
+    ASSERT_EQ(m.message_number, 0);
     ASSERT_TRUE(m.data.is_nil());
 }
 
@@ -68,6 +70,7 @@ TEST(test_mcp_message, from_bytes) {
             "timestamp", 100.1,
             "next_timestamp", Data(),
             "settings_overlay", Data(),
+            "message_number", 0,
             "data", Data()
             );
 
@@ -84,6 +87,7 @@ TEST(test_mcp_message, from_bytes) {
     ASSERT_EQ(m.timestamp, 100.1);
     ASSERT_FALSE(m.next_timestamp.is_set());
     ASSERT_TRUE(m.settings_overlay.is_nil());
+    ASSERT_EQ(m.message_number, 0);
     ASSERT_TRUE(m.data.is_nil());
 }
 
diff --git a/libmuscle/cpp/src/libmuscle/tests/test_outbox.cpp b/libmuscle/cpp/src/libmuscle/tests/test_outbox.cpp
index 07486cb3..0d6769c5 100644
--- a/libmuscle/cpp/src/libmuscle/tests/test_outbox.cpp
+++ b/libmuscle/cpp/src/libmuscle/tests/test_outbox.cpp
@@ -36,6 +36,7 @@ TEST(libmuscle_outbox, test_deposit_retrieve_message) {
             Optional<int>(),
             0.0, 1.0,
             DataConstRef(),
+            0,
             DataConstRef("testing"));
 
     auto message_data = std::make_unique<DataConstRef>(message.encoded());
diff --git a/libmuscle/cpp/src/libmuscle/tests/test_post_office.cpp b/libmuscle/cpp/src/libmuscle/tests/test_post_office.cpp
index 68af8bf7..f6cf05c2 100644
--- a/libmuscle/cpp/src/libmuscle/tests/test_post_office.cpp
+++ b/libmuscle/cpp/src/libmuscle/tests/test_post_office.cpp
@@ -36,7 +36,7 @@ std::unique_ptr<DataConstRef> make_message() {
             "test_sender.port", "test_receiver.port",
             Optional<int>(),
             0.0, 1.0,
-            DataConstRef(), DataConstRef());
+            DataConstRef(), 0, DataConstRef());
     return std::make_unique<DataConstRef>(msg.encoded());
 }
 
diff --git a/libmuscle/cpp/src/libmuscle/tests/test_tcp_communication.cpp b/libmuscle/cpp/src/libmuscle/tests/test_tcp_communication.cpp
index f9c60c30..c6400404 100644
--- a/libmuscle/cpp/src/libmuscle/tests/test_tcp_communication.cpp
+++ b/libmuscle/cpp/src/libmuscle/tests/test_tcp_communication.cpp
@@ -36,7 +36,7 @@ TEST(test_tcp_communication, send_receive) {
     MPPMessage msg(
             "test_sender.port", receiver, 10,
             0.0, 1.0,
-            Data::dict("par1", 13),
+            Data::dict("par1", 13), 1,
             Data::dict("var1", 1, "var2", 2.0, "var3", "3"));
     auto msg_data = std::make_unique<DataConstRef>(msg.encoded());
     post_office.deposit(receiver, std::move(msg_data));
@@ -53,6 +53,7 @@ TEST(test_tcp_communication, send_receive) {
     ASSERT_EQ(m.timestamp, 0.0);
     ASSERT_EQ(m.next_timestamp, 1.0);
     ASSERT_EQ(m.settings_overlay["par1"].as<int>(), 13);
+    ASSERT_EQ(m.message_number, 1);
     ASSERT_EQ(m.data["var1"].as<int>(), 1);
     ASSERT_EQ(m.data["var2"].as<double>(), 2.0);
     ASSERT_EQ(m.data["var3"].as<std::string>(), "3");

From 091c0511702c48a73bfc8822b9c296e4788efbd0 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Mon, 29 Aug 2022 10:58:12 +0200
Subject: [PATCH 017/183] Add missed slot argument

---
 libmuscle/python/libmuscle/communicator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libmuscle/python/libmuscle/communicator.py b/libmuscle/python/libmuscle/communicator.py
index af7e14bd..f565f6c5 100644
--- a/libmuscle/python/libmuscle/communicator.py
+++ b/libmuscle/python/libmuscle/communicator.py
@@ -322,7 +322,7 @@ def receive_message(self, port_name: str, slot: Optional[int] = None,
                     port.is_resuming(slot)):
                 _logger.debug(f'Discarding received message on {port_and_slot}'
                               ': resuming from weakly consistent snapshot')
-                port.set_resumed()
+                port.set_resumed(slot)
                 return self.receive_message(port_name, slot, default)
             raise RuntimeError(f'Received message on {port_and_slot} with'
                                ' unexpected message number'

From 2da60951f3dbdd9f47584cddb41197b9f11374be Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Mon, 29 Aug 2022 13:04:58 +0200
Subject: [PATCH 018/183] Getting/restoring port message counts

---
 libmuscle/cpp/src/libmuscle/port.cpp | 23 +++++++++++++++--------
 libmuscle/cpp/src/libmuscle/port.hpp | 13 +++++++++++--
 libmuscle/python/libmuscle/port.py   | 25 ++++++++++++++++---------
 3 files changed, 42 insertions(+), 19 deletions(-)

diff --git a/libmuscle/cpp/src/libmuscle/port.cpp b/libmuscle/cpp/src/libmuscle/port.cpp
index c4e7b4f9..70db0550 100644
--- a/libmuscle/cpp/src/libmuscle/port.cpp
+++ b/libmuscle/cpp/src/libmuscle/port.cpp
@@ -29,8 +29,7 @@ namespace libmuscle { namespace impl {
 Port::Port(
         std::string const & name, Operator oper,
         bool is_vector, bool is_connected,
-        int our_ndims, std::vector<int> peer_dims,
-        std::vector<int> num_messages)
+        int our_ndims, std::vector<int> peer_dims)
     : ::ymmsl::Port(Identifier(name), oper)
 {
     is_connected_ = is_connected;
@@ -67,12 +66,8 @@ Port::Port(
     }
 
     is_resizable_ = is_vector && (our_ndims == static_cast<int>(peer_dims.size()));
-    if (!num_messages.empty()) {
-        num_messages_ = num_messages;
-        is_resuming_.resize(num_messages_.size(), true);
-    }
-    extend_vector_to_size(num_messages_, std::max(1, length_), 0);
-    extend_vector_to_size(is_resuming_, std::max(1, length_), false);
+    num_messages_.resize(std::max(1, length_), 0);
+    is_resuming_.resize(std::max(1, length_), false);
 }
 
 bool Port::is_connected() const {
@@ -130,6 +125,18 @@ void Port::set_closed(int slot) {
     is_open_[slot] = false;
 }
 
+void Port::restore_message_counts(const std::vector<int> &num_messages) {
+    num_messages_ = std::vector<int>(num_messages);
+    is_resuming_.clear();
+    is_resuming_.resize(num_messages_.size(), true);
+    extend_vector_to_size(num_messages_, std::max(1, length_), 0);
+    extend_vector_to_size(is_resuming_, std::max(1, length_), false);
+}
+
+const std::vector<int> & Port::get_message_counts() const {
+    return num_messages_;
+}
+
 void Port::increment_num_messages() {
     num_messages_[0] ++;
     set_resumed();
diff --git a/libmuscle/cpp/src/libmuscle/port.hpp b/libmuscle/cpp/src/libmuscle/port.hpp
index 913a1917..18cfb5d9 100644
--- a/libmuscle/cpp/src/libmuscle/port.hpp
+++ b/libmuscle/cpp/src/libmuscle/port.hpp
@@ -30,8 +30,7 @@ class Port : public ::ymmsl::Port {
         Port(
                 std::string const & name, ::ymmsl::Operator oper,
                 bool is_vector, bool is_connected,
-                int our_ndims, std::vector<int> peer_dims,
-                std::vector<int> num_messages=std::vector<int>(0));
+                int our_ndims, std::vector<int> peer_dims);
 
         // Note: we only ever use this Port in libmuscle, and only use
         // ymmsl::Port in ymmsl. Port objects are always handled by value, so
@@ -109,6 +108,16 @@ class Port : public ::ymmsl::Port {
          */
         void set_closed(int slot);
 
+        /** Restore message counts from a snapshot.
+         *
+         * @param num_messages message counts of the snapshot
+         */
+        void restore_message_counts(const std::vector<int> &num_messages);
+
+        /** Get the message counts for all slots in this port
+         */
+        const std::vector<int> & get_message_counts() const;
+
         /** Increment amount of messages sent or received.
          */
         void increment_num_messages();
diff --git a/libmuscle/python/libmuscle/port.py b/libmuscle/python/libmuscle/port.py
index 9aced2d5..a6f955a5 100644
--- a/libmuscle/python/libmuscle/port.py
+++ b/libmuscle/python/libmuscle/port.py
@@ -32,8 +32,7 @@ class Port(ymmsl.Port):
     """
 
     def __init__(self, name: str, operator: Operator, is_vector: bool,
-                 is_connected: bool, our_ndims: int, peer_dims: List[int],
-                 num_messages: Optional[List[int]] = None
+                 is_connected: bool, our_ndims: int, peer_dims: List[int]
                  ) -> None:
         """Create a Port.
 
@@ -86,13 +85,8 @@ def __init__(self, name: str, operator: Operator, is_vector: bool,
             self._is_open = [True]
 
         self._is_resizable = is_vector and (our_ndims == len(peer_dims))
-        self._num_messages = []  # type: List[int]
-        self._is_resuming = []  # type: List[bool]
-        if num_messages is not None:
-            self._num_messages = num_messages
-            self._is_resuming = [True] * len(num_messages)
-        _extend_list_to_size(self._num_messages, self._length or 1, 0)
-        _extend_list_to_size(self._is_resuming, self._length or 1, False)
+        self._num_messages = [0] * (self._length or 1)
+        self._is_resuming = [False] * (self._length or 1)
 
     # Note: I'm not sure how this will develop exactly, so this class has some
     # accessors even if those are un-Pythonic; in the future a simple variable
@@ -168,6 +162,19 @@ def set_closed(self, slot: Optional[int] = None) -> None:
         else:
             self._is_open = [False]
 
+    def restore_message_counts(self, num_messages: List[int]) -> None:
+        """Restore message counts from a snapshot
+        """
+        self._num_messages = num_messages
+        self._is_resuming = [True] * len(self._num_messages)
+        _extend_list_to_size(self._num_messages, self._length or 1, 0)
+        _extend_list_to_size(self._is_resuming, self._length or 1, False)
+
+    def get_message_counts(self) -> List[int]:
+        """Get a list of message counts for all slots in this port
+        """
+        return self._num_messages.copy()
+
     def increment_num_messages(self, slot: Optional[int] = None) -> None:
         """Increment amount of messages sent or received.
 

From d07929f604a0830a58c7c3ad9d9aa240ab9f2780 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Tue, 30 Aug 2022 09:22:14 +0200
Subject: [PATCH 019/183] Add a snapshot trigger manager and rename files

---
 .../python/libmuscle/checkpoint_triggers.py   | 337 ++++++++++++++++++
 .../python/libmuscle/snapshot_manager.py      | 151 --------
 ...manager.py => test_checkpoint_triggers.py} |  91 ++++-
 3 files changed, 425 insertions(+), 154 deletions(-)
 create mode 100644 libmuscle/python/libmuscle/checkpoint_triggers.py
 delete mode 100644 libmuscle/python/libmuscle/snapshot_manager.py
 rename libmuscle/python/libmuscle/test/{test_snapshot_manager.py => test_checkpoint_triggers.py} (58%)

diff --git a/libmuscle/python/libmuscle/checkpoint_triggers.py b/libmuscle/python/libmuscle/checkpoint_triggers.py
new file mode 100644
index 00000000..214fd872
--- /dev/null
+++ b/libmuscle/python/libmuscle/checkpoint_triggers.py
@@ -0,0 +1,337 @@
+import bisect
+from datetime import datetime, timezone
+import logging
+import os
+import time
+from typing import List, Optional, Union
+
+from ymmsl import CheckpointRange, CheckpointRules, Checkpoints
+
+
+_logger = logging.getLogger(__name__)
+
+
+def _checkpoint_error(description: str) -> None:
+    if "MUSCLE_DISABLE_CHECKPOINT_ERRORS" in os.environ:
+        _logger.warning(f"Suppressed checkpoint error: {description}")
+    else:
+        raise RuntimeError(description)
+
+
+class CheckpointTrigger:
+    """Represents a trigger for creating snapshots"""
+
+    def next_checkpoint(self, cur_time: float) -> Optional[float]:
+        """Calculate the next checkpoint time
+
+        Args:
+            cur_time: current time.
+
+        Returns:
+            The time when a next checkpoint should be taken, or None if this
+            trigger has no checkpoint after cur_time.
+        """
+        raise NotImplementedError()
+
+    def previous_checkpoint(self, cur_time: float) -> Optional[float]:
+        """Calculate the previous checkpoint time
+
+        Args:
+            cur_time: current time.
+
+        Returns:
+            The time when a previous checkpoint should have been taken, or None
+            if this trigger has no checkpoint after cur_time.
+        """
+        raise NotImplementedError()
+
+
+class AtCheckpointTrigger(CheckpointTrigger):
+    """Represents a trigger based on an "at" checkpoint rule
+
+    This triggers at the specified times.
+    """
+
+    def __init__(self, at: List[Union[float, int]]) -> None:
+        """Create an "at" checkpoint trigger
+
+        Args:
+            at: list of checkpoint moments
+        """
+        self._at = at
+        self._at.sort()  # ymmsl already sorts, but just to be sure
+
+    def next_checkpoint(self, cur_time: float) -> Optional[float]:
+        if cur_time >= self._at[-1]:
+            return None  # no future checkpoint left
+        idx = bisect.bisect(self._at, cur_time)
+        return self._at[idx]
+
+    def previous_checkpoint(self, cur_time: float) -> Optional[float]:
+        if cur_time < self._at[0]:
+            return None  # no previous checkpoint
+        idx = bisect.bisect(self._at, cur_time)
+        return self._at[idx - 1]
+
+
+class RangeCheckpointTrigger(CheckpointTrigger):
+    """Represents a trigger based on a "ranges" checkpoint rule
+
+    This triggers at a range of checkpoint moments.
+
+    Equivalent an "at" rule ``[start, start + step, start + 2*step, ...]`` for
+    as long as ``start + i*step <= stop``.
+
+    Stop may be omitted, in which case the range is infinite.
+
+    Start may be omitted, in which case the range is equivalent to an "at" rule
+    ``[..., -n*step, ..., -step, 0, step, 2*step, ...]`` for as long as
+    ``i*step <= stop``.
+
+    Note: the "every" rule is a special case of a range with start and stop
+    omitted, and is handled by this class as well
+    """
+
+    def __init__(self, range: CheckpointRange) -> None:
+        """Create a range of checkpoints
+
+        Args:
+            range: checkpoint range defining start, stop and step.
+        """
+        self._start = range.start
+        self._stop = range.stop
+        self._step = range.step
+        self._last = None  # type: Union[int, float, None]
+        if self._stop is not None:
+            start = 0 if self._start is None else self._start
+            diff = self._stop - start
+            self._last = start + (diff // self._step) * self._step
+
+    def next_checkpoint(self, cur_time: float) -> Optional[float]:
+        if self._start is not None and cur_time < self._start:
+            return float(self._start)
+        if self._last is not None and cur_time >= self._last:
+            return None
+        start = 0 if self._start is None else self._start
+        diff = cur_time - start
+        return float(start + (diff // self._step + 1) * self._step)
+
+    def previous_checkpoint(self, cur_time: float) -> Optional[float]:
+        if self._start is not None and cur_time < self._start:
+            return None
+        if self._last is not None and cur_time > self._last:
+            return float(self._last)
+        start = 0 if self._start is None else self._start
+        diff = cur_time - start
+        return float(start + (diff // self._step) * self._step)
+
+
+class CombinedCheckpointTriggers(CheckpointTrigger):
+    """Checkpoint trigger based on a combination of "every", "at" and "ranges"
+    """
+
+    def __init__(self, checkpoint_rules: Optional[CheckpointRules]) -> None:
+        """Create a new combined checkpoint trigger from the given rules
+
+        Args:
+            checkpoint_rules: checkpoint rules (from ymmsl) defining "every",
+                "at", and/or "ranges" rules
+        """
+        self._triggers = []  # type: List[CheckpointTrigger]
+        if checkpoint_rules is None:
+            return
+        if checkpoint_rules.every is not None:
+            cp_range = CheckpointRange(step=checkpoint_rules.every)
+            self._triggers.append(RangeCheckpointTrigger(cp_range))
+        if checkpoint_rules.at:
+            self._triggers.append(AtCheckpointTrigger(checkpoint_rules.at))
+        for cp_range in checkpoint_rules.ranges:
+            self._triggers.append(RangeCheckpointTrigger(cp_range))
+
+    def next_checkpoint(self, cur_time: float) -> Optional[float]:
+        checkpoints = (trigger.next_checkpoint(cur_time)
+                       for trigger in self._triggers)
+        # return earliest of all not-None next-checkpoints
+        return min((checkpoint
+                    for checkpoint in checkpoints
+                    if checkpoint is not None),
+                   default=None)  # return None if all triggers return None
+
+    def previous_checkpoint(self, cur_time: float) -> Optional[float]:
+        checkpoints = (trigger.previous_checkpoint(cur_time)
+                       for trigger in self._triggers)
+        # return latest of all not-None previous-checkpoints
+        return max((checkpoint
+                    for checkpoint in checkpoints
+                    if checkpoint is not None),
+                   default=None)  # return None if all triggers return None
+
+
+def _utc_to_monotonic(utc: datetime) -> float:
+    """Convert UTC time point to a reference value of time.monotonic()
+
+    Args:
+        utc: datetime in UTC timezone
+    """
+    curmono = time.monotonic()
+    curutc = datetime.now(timezone.utc)
+    elapsed_seconds = (curutc - utc).total_seconds()
+    return curmono - elapsed_seconds
+
+
+class TriggerManager:
+    """Manages all checkpoint triggers and checks if a snapshot must be saved.
+    """
+
+    def __init__(self, reference_utctime: datetime, checkpoints: Checkpoints
+                 ) -> None:
+        self._monotonic_reference = _utc_to_monotonic(reference_utctime)
+
+        self._wall = CombinedCheckpointTriggers(checkpoints.wallclocktime)
+        self._prevwall = 0.0
+        self._nextwall = self._wall.next_checkpoint(0.0)  # type: Optional[float]
+
+        self._sim = CombinedCheckpointTriggers(checkpoints.simulationtime)
+        self._prevsim = None        # type: Optional[float]
+        self._nextsim = None        # type: Optional[float]
+        self._sim_reset = True
+
+        self._last_triggers = []    # type: List[str]
+        self._first_reuse = True
+
+        # These attributes are only used to check if implementations are
+        # following the guidelines
+        self._should_have_saved = False
+        self._should_save_final_called = False
+        self._saved_final_checkpoint = False
+
+    def elapsed_walltime(self) -> float:
+        """Returns elapsed wallclocktime in seconds.
+        """
+        return time.monotonic() - self._monotonic_reference
+
+    def should_save_snapshot(self, timestamp: float,
+                             next_timestamp: Optional[float]) -> bool:
+        """Handles instance.should_save_snapshot
+        """
+        if self._should_have_saved:
+            _checkpoint_error('"should_save_snapshot" or '
+                              '"should_save_final_snapshot" returned positive'
+                              ' but no snapshot was saved before the next call')
+
+        value = False
+        elapsed_walltime = self.elapsed_walltime()
+        if next_timestamp is None:
+            _logger.warning('No "next_timestamp" provided. Workflow may not'
+                            ' be able to create a consistent snapshot. See '
+                            'https://muscle3.readthedocs.io/en/latest/checkpoints.html')
+            value = self.__should_save(elapsed_walltime, timestamp)
+        else:
+            value = self.__should_save(elapsed_walltime, next_timestamp)
+        self._should_have_saved = value
+        return value
+
+    def should_save_final_snapshot(self, timestamp: float) -> bool:
+        """Handles instance.should_save_final_snapshot
+        """
+        if self._should_have_saved:
+            _checkpoint_error('"should_save_snapshot" or '
+                              '"should_save_final_snapshot" returned positive'
+                              ' but no snapshot was saved before the next call')
+
+        value = False
+        if self._max_f_init_next_timestamp is None:
+            # If the messages on F_INIT do not supply a next_timestamp, we will
+            # always snapshot just before O_I
+            value = True
+            self._last_triggers = ['No "next_timestamp" provided on F_INIT'
+                                   ' messages']
+        else:
+            elapsed_walltime = self.elapsed_walltime()
+            value = self.__should_save(elapsed_walltime,
+                                       self._max_f_init_next_timestamp)
+
+        self._should_have_saved = value
+        self._should_save_final_called = True
+        return value
+
+    def reuse_instance(self, max_f_init_next_timestamp: Optional[float]
+                       ) -> None:
+        """Cleanup between instance reuse
+
+        Args:
+            max_f_init_next_timestamp: the maximum next_timestamp of all
+                messages pre--received during F_INIT.
+        """
+        self._max_f_init_next_timestamp = max_f_init_next_timestamp
+
+        if self._first_reuse:
+            self._first_reuse = False
+        else:
+            if self._should_have_saved:
+                _checkpoint_error('"should_save_snapshot" or '
+                                  '"should_save_final_snapshot" returned'
+                                  ' positive but no snapshot was saved before'
+                                  ' exiting the reuse loop.')
+            if not (self._should_save_final_called or self._saved_final_checkpoint):
+                _checkpoint_error('You must call "should_save_final" exactly'
+                                  ' once in the reuse loop of an instance that'
+                                  ' supports checkpointing.')
+            self._should_save_final_called = False
+            self._saved_final_checkpoint = False
+
+    def update_checkpoints(self, simulationtime: float, final: bool) -> float:
+        """Update last and next checkpoint times when a snapshot is made
+
+        Args:
+            simulationtime: next timestamp as reported by the instance (if
+                available, otherwise current timestamp)
+
+        Returns:
+            Current elapsed walltime
+        """
+        self._prevwall = self.elapsed_walltime()
+        self._nextwall = self._wall.next_checkpoint(self._prevwall)
+
+        if final and self._max_f_init_next_timestamp is not None:
+            simulationtime = self._max_f_init_next_timestamp
+        self._prevsim = simulationtime
+        self._nextsim = self._sim.next_checkpoint(simulationtime)
+
+        self._should_have_saved = False
+        self._saved_final_checkpoint = final
+        return self._prevwall
+
+    def get_triggers(self) -> List[str]:
+        """Get trigger description(s) for the current reason for checkpointing.
+        """
+        triggers = self._last_triggers
+        self._last_triggers = []
+        return triggers
+
+    def __should_save(self, walltime: float, simulationtime: float) -> bool:
+        """Check if a checkpoint should be taken
+
+        Args:
+            walltime: current wallclock time (elapsed since reference)
+            simulationtime: current/next timestamp as reported by the instance
+        """
+        if self._sim_reset:
+            # we cannot make assumptions about the start time of a simulation,
+            # a t=-1000 could make sense if t represents years since CE
+            # and we should not disallow checkpointing for negative t
+            previous = self._sim.previous_checkpoint(simulationtime)
+            if previous is not None:
+                # there is a checkpoint rule before the current moment, assume
+                # we should have taken a snapshot back then
+                self._nextsim = previous
+            else:
+                self._nextsim = self._sim.next_checkpoint(simulationtime)
+            self._sim_reset = False
+
+        self._last_triggers = []
+        if self._nextwall is not None and walltime >= self._nextwall:
+            self._last_triggers.append(f"wallclocktime >= {self._nextwall}")
+        if self._nextsim is not None and simulationtime >= self._nextsim:
+            self._last_triggers.append(f"simulationtime >= {self._nextsim}")
+        return bool(self._last_triggers)
diff --git a/libmuscle/python/libmuscle/snapshot_manager.py b/libmuscle/python/libmuscle/snapshot_manager.py
deleted file mode 100644
index 36b742d8..00000000
--- a/libmuscle/python/libmuscle/snapshot_manager.py
+++ /dev/null
@@ -1,151 +0,0 @@
-import bisect
-from typing import List, Optional, Union
-
-from ymmsl import CheckpointRange, CheckpointRules
-
-
-class CheckpointTrigger:
-    """Represents a trigger for creating snapshots"""
-
-    def next_checkpoint(self, cur_time: float) -> Optional[float]:
-        """Calculate the next checkpoint time
-
-        Args:
-            cur_time: current time.
-
-        Returns:
-            The time when a next checkpoint should be taken, or None if this
-            trigger has no checkpoint after cur_time.
-        """
-        raise NotImplementedError()
-
-    def previous_checkpoint(self, cur_time: float) -> Optional[float]:
-        """Calculate the previous checkpoint time
-
-        Args:
-            cur_time: current time.
-
-        Returns:
-            The time when a previous checkpoint should have been taken, or None
-            if this trigger has no checkpoint after cur_time.
-        """
-        raise NotImplementedError()
-
-
-class AtCheckpointTrigger(CheckpointTrigger):
-    """Represents a trigger based on an "at" checkpoint rule
-
-    This triggers at the specified times.
-    """
-
-    def __init__(self, at: List[Union[float, int]]) -> None:
-        """Create an "at" checkpoint trigger
-
-        Args:
-            at: list of checkpoint moments
-        """
-        self._at = at
-        self._at.sort()  # ymmsl already sorts, but just to be sure
-
-    def next_checkpoint(self, cur_time: float) -> Optional[float]:
-        if cur_time >= self._at[-1]:
-            return None  # no future checkpoint left
-        idx = bisect.bisect(self._at, cur_time)
-        return self._at[idx]
-
-    def previous_checkpoint(self, cur_time: float) -> Optional[float]:
-        if cur_time < self._at[0]:
-            return None  # no previous checkpoint
-        idx = bisect.bisect(self._at, cur_time)
-        return self._at[idx - 1]
-
-
-class RangeCheckpointTrigger(CheckpointTrigger):
-    """Represents a trigger based on a "ranges" checkpoint rule
-
-    This triggers at a range of checkpoint moments.
-
-    Equivalent an "at" rule ``[start, start + step, start + 2*step, ...]`` for
-    as long as ``start + i*step <= stop``.
-
-    Stop may be omitted, in which case the range is infinite.
-
-    Start may be omitted, in which case the range is equivalent to an "at" rule
-    ``[..., -n*step, ..., -step, 0, step, 2*step, ...]`` for as long as
-    ``i*step <= stop``.
-
-    Note: the "every" rule is a special case of a range with start and stop
-    omitted, and is handled by this class as well
-    """
-
-    def __init__(self, range: CheckpointRange) -> None:
-        """Create a range of checkpoints
-
-        Args:
-            range: checkpoint range defining start, stop and step.
-        """
-        self._start = range.start
-        self._stop = range.stop
-        self._step = range.step
-        self._last = None  # type: Union[int, float, None]
-        if self._stop is not None:
-            start = 0 if self._start is None else self._start
-            diff = self._stop - start
-            self._last = start + (diff // self._step) * self._step
-
-    def next_checkpoint(self, cur_time: float) -> Optional[float]:
-        if self._start is not None and cur_time < self._start:
-            return float(self._start)
-        if self._last is not None and cur_time >= self._last:
-            return None
-        start = 0 if self._start is None else self._start
-        diff = cur_time - start
-        return float(start + (diff // self._step + 1) * self._step)
-
-    def previous_checkpoint(self, cur_time: float) -> Optional[float]:
-        if self._start is not None and cur_time < self._start:
-            return None
-        if self._last is not None and cur_time > self._last:
-            return float(self._last)
-        start = 0 if self._start is None else self._start
-        diff = cur_time - start
-        return float(start + (diff // self._step) * self._step)
-
-
-class CombinedCheckpointTriggers(CheckpointTrigger):
-    """Checkpoint trigger based on a combination of "every", "at" and "ranges"
-    """
-
-    def __init__(self, checkpoint_rules: CheckpointRules) -> None:
-        """Create a new combined checkpoint trigger from the given rules
-
-        Args:
-            checkpoint_rules: checkpoint rules (from ymmsl) defining "every",
-                "at", and/or "ranges" rules
-        """
-        self._triggers = []  # type: List[CheckpointTrigger]
-        if checkpoint_rules.every is not None:
-            cp_range = CheckpointRange(step=checkpoint_rules.every)
-            self._triggers.append(RangeCheckpointTrigger(cp_range))
-        if checkpoint_rules.at:
-            self._triggers.append(AtCheckpointTrigger(checkpoint_rules.at))
-        for cp_range in checkpoint_rules.ranges:
-            self._triggers.append(RangeCheckpointTrigger(cp_range))
-
-    def next_checkpoint(self, cur_time: float) -> Optional[float]:
-        checkpoints = (trigger.next_checkpoint(cur_time)
-                       for trigger in self._triggers)
-        # return earliest of all not-None next-checkpoints
-        return min((checkpoint
-                    for checkpoint in checkpoints
-                    if checkpoint is not None),
-                   default=None)  # return None if all triggers return None
-
-    def previous_checkpoint(self, cur_time: float) -> Optional[float]:
-        checkpoints = (trigger.previous_checkpoint(cur_time)
-                       for trigger in self._triggers)
-        # return latest of all not-None previous-checkpoints
-        return max((checkpoint
-                    for checkpoint in checkpoints
-                    if checkpoint is not None),
-                   default=None)  # return None if all triggers return None
diff --git a/libmuscle/python/libmuscle/test/test_snapshot_manager.py b/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py
similarity index 58%
rename from libmuscle/python/libmuscle/test/test_snapshot_manager.py
rename to libmuscle/python/libmuscle/test/test_checkpoint_triggers.py
index b557e5b8..6cb645c2 100644
--- a/libmuscle/python/libmuscle/test/test_snapshot_manager.py
+++ b/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py
@@ -1,8 +1,12 @@
+from datetime import datetime, timedelta, timezone
+import logging
+import time
 import pytest
-from ymmsl import CheckpointRange, CheckpointRules
+from ymmsl import CheckpointRange, CheckpointRules, Checkpoints
 
-from libmuscle.snapshot_manager import (
-    CombinedCheckpointTriggers, AtCheckpointTrigger, RangeCheckpointTrigger)
+from libmuscle.checkpoint_triggers import (
+    CombinedCheckpointTriggers, AtCheckpointTrigger, RangeCheckpointTrigger,
+    TriggerManager)
 
 
 def test_at_checkpoint_trigger():
@@ -131,3 +135,84 @@ def test_combined_checkpoint_trigger_at_ranges():
 
     assert trigger.next_checkpoint(125.2) is None
     assert trigger.previous_checkpoint(125.2) == pytest.approx(100)
+
+
+def test_trigger_manager_reference_time():
+    monotonic_now = time.monotonic()
+    utcnow = datetime.now(timezone.utc)
+    reference = utcnow - timedelta(seconds=15)
+    trigger_manager = TriggerManager(reference, Checkpoints())
+    elapsed_walltime = trigger_manager.elapsed_walltime()
+    elapsed_monotonic = time.monotonic() - monotonic_now
+    assert 15.0 < elapsed_walltime <= (15.0 + elapsed_monotonic)
+
+
+def test_trigger_manager():
+    reference = datetime.now(timezone.utc)
+    trigger_manager = TriggerManager(reference, Checkpoints(
+            wallclocktime=CheckpointRules(at=[1e-12]),
+            simulationtime=CheckpointRules(at=[1, 3, 5])))
+
+    trigger_manager.reuse_instance(7)
+
+    t, t_next = 0.1, 0.2
+    assert trigger_manager.should_save_snapshot(t, t_next)
+    triggers = trigger_manager.get_triggers()
+    assert len(triggers) == 1
+    assert "wallclocktime" in triggers[0]
+    with pytest.raises(RuntimeError):  # did not call save in between
+        trigger_manager.should_save_snapshot(t, t_next)
+    trigger_manager.update_checkpoints(t_next, False)
+
+    t, t_next = 0.2, 0.9
+    assert not trigger_manager.should_save_snapshot(t, t_next)
+
+    t, t_next = 0.9, 3.1
+    assert trigger_manager.should_save_snapshot(t, t_next)
+    assert len(trigger_manager.get_triggers()) == 1
+    trigger_manager.update_checkpoints(t_next, False)
+
+    t, t_next = 3.1, None
+    assert trigger_manager.should_save_final_snapshot(t)
+    with pytest.raises(RuntimeError):  # did not call save in between
+        trigger_manager.should_save_snapshot(t, 4.0)
+    with pytest.raises(RuntimeError):  # did not call save in between
+        trigger_manager.should_save_final_snapshot(t)
+    assert len(trigger_manager.get_triggers()) > 0
+    trigger_manager.update_checkpoints(t, True)
+
+    trigger_manager.reuse_instance(None)
+
+    t, t_next = 7.1, 8.2
+    assert not trigger_manager.should_save_snapshot(t, t_next)
+    with pytest.raises(RuntimeError):  # no should_save_final called
+        trigger_manager.reuse_instance(None)
+    t, t_next = 8.2, None
+    assert trigger_manager.should_save_final_snapshot(t)
+    with pytest.raises(RuntimeError):  # not saved
+        trigger_manager.reuse_instance(None)
+    trigger_manager.update_checkpoints(t, True)
+
+    trigger_manager.reuse_instance(None)
+
+
+def test_trigger_manager_warnings(caplog: pytest.LogCaptureFixture,
+                                  monkeypatch: pytest.MonkeyPatch):
+    monkeypatch.setenv("MUSCLE_DISABLE_CHECKPOINT_ERRORS", "1")
+
+    reference = datetime.now(timezone.utc)
+    trigger_manager = TriggerManager(reference, Checkpoints(
+            simulationtime=CheckpointRules(at=[1, 3, 5])))
+
+    trigger_manager.reuse_instance(2)
+
+    with caplog.at_level(logging.WARN):
+        n_records = len(caplog.records)
+        assert trigger_manager.should_save_snapshot(1.5, None)
+        assert len(caplog.records) == n_records + 1
+        assert "next_timestamp" in caplog.records[-1].message
+
+        n_records = len(caplog.records)
+        trigger_manager.reuse_instance(None)  # suppressed error
+        assert len(caplog.records) > n_records
+        assert "Suppressed checkpoint error" in caplog.records[-1].message

From 54c5e1ec4eef2d3348ac78ad974cdc4af73a7f75 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Tue, 30 Aug 2022 15:17:44 +0200
Subject: [PATCH 020/183] Communicator get/restore port message counts

And unit tests for communicator changes
---
 libmuscle/python/libmuscle/communicator.py    |  24 ++++
 .../libmuscle/test/test_communicator.py       | 127 ++++++++++++++++++
 2 files changed, 151 insertions(+)

diff --git a/libmuscle/python/libmuscle/communicator.py b/libmuscle/python/libmuscle/communicator.py
index f565f6c5..7ffff004 100644
--- a/libmuscle/python/libmuscle/communicator.py
+++ b/libmuscle/python/libmuscle/communicator.py
@@ -366,6 +366,30 @@ def shutdown(self) -> None:
         for server in self._servers:
             server.close()
 
+    def restore_message_counts(self, port_message_counts: Dict[str, List[int]]
+                               ) -> None:
+        """Restore message counts on all ports
+        """
+        for port_name, num_messages in port_message_counts.items():
+            if port_name == "muscle_settings_in":
+                self._muscle_settings_in.restore_message_counts(num_messages)
+            elif port_name in self._ports:
+                self._ports[port_name].restore_message_counts(num_messages)
+            else:
+                raise RuntimeError(f'Unknown port {port_name} in snapshot.'
+                                   ' Have your port definitions changed since'
+                                   ' the snapshot was taken?')
+        # TODO decide if we should check whether all ports are covered
+
+    def get_message_counts(self) -> Dict[str, List[int]]:
+        """Get message counts for all ports on the communicator
+        """
+        port_message_counts = {port_name: port.get_message_counts()
+                               for port_name, port in self._ports.items()}
+        port_message_counts["muscle_settings_in"] = \
+            self._muscle_settings_in.get_message_counts()
+        return port_message_counts
+
     def __instance_id(self) -> Reference:
         """Returns our complete instance id.
         """
diff --git a/libmuscle/python/libmuscle/test/test_communicator.py b/libmuscle/python/libmuscle/test/test_communicator.py
index a4f3a751..8fb7a527 100644
--- a/libmuscle/python/libmuscle/test/test_communicator.py
+++ b/libmuscle/python/libmuscle/test/test_communicator.py
@@ -1,3 +1,4 @@
+import logging
 from libmuscle.communicator import Communicator, Endpoint, Message
 from libmuscle.mpp_message import ClosePort, MPPMessage
 from libmuscle.port import Port
@@ -67,6 +68,8 @@ def gpe(p, s) -> Reference:
     communicator._ports = {
             'out': Port('out', Operator.O_I, False, True, 1, []),
             'in': Port('in', Operator.S, False, True, 1, [])}
+    communicator._muscle_settings_in = \
+        communicator._Communicator__settings_in_port([])
     yield communicator
     communicator.shutdown()
 
@@ -103,6 +106,8 @@ def gpe(p, s) -> Reference:
     communicator._ports = {
             'out': Port('out', Operator.O_I, True, True, 0, [20]),
             'in': Port('in', Operator.S, True, True, 0, [20])}
+    communicator._muscle_settings_in = \
+        communicator._Communicator__settings_in_port([])
     yield communicator
     communicator.shutdown()
 
@@ -554,3 +559,125 @@ def test_get_message(communicator, message) -> None:
             None, 0.0, None, Settings(), 0, b'test').encoded()
     assert communicator._post_office.get_message(
             'other.in[13]') == ref_message
+
+
+def test_port_message_counts(communicator, message) -> None:
+    communicator.send_message('out', message)
+    msg_counts = communicator.get_message_counts()
+    assert msg_counts == {'out': [1],
+                          'in': [0],
+                          'muscle_settings_in': [0]}
+
+    communicator.restore_message_counts({'out': [3],
+                                         'in': [2],
+                                         'muscle_settings_in': [4]})
+    communicator.send_message('out', message)
+    msg_counts = communicator.get_message_counts()
+    assert msg_counts == {'out': [4],
+                          'in': [2],
+                          'muscle_settings_in': [4]}
+
+    # empty post office
+    communicator._post_office.get_message('other.in[13]')
+    communicator._post_office.get_message('other.in[13]')
+
+    with pytest.raises(RuntimeError):
+        communicator.restore_message_counts({"x?invalid_port": 3})
+
+
+def test_vector_port_message_counts(communicator2, message) -> None:
+    msg_counts = communicator2.get_message_counts()
+    assert msg_counts == {'out': [0] * 20,
+                          'in': [0] * 20,
+                          'muscle_settings_in': [0]}
+
+    communicator2.send_message('out', message, 13)
+    msg_counts = communicator2.get_message_counts()
+    assert msg_counts == {'out': [0] * 13 + [1] + [0] * 6,
+                          'in': [0] * 20,
+                          'muscle_settings_in': [0]}
+
+    communicator2.restore_message_counts({'out': list(range(20)),
+                                          'in': list(range(20)),
+                                          'muscle_settings_in': [4]})
+    communicator2.send_message('out', message, 13)
+    msg_counts = communicator2.get_message_counts()
+    assert msg_counts == {'out': list(range(13)) + [14] + list(range(14, 20)),
+                          'in': list(range(20)),
+                          'muscle_settings_in': [4]}
+
+    # empty post office
+    communicator2._post_office.get_message('kernel[13].in')
+    communicator2._post_office.get_message('kernel[13].in')
+
+
+def test_port_count_validation(communicator):
+    client_mock = MagicMock()
+    client_mock.receive.return_value = MPPMessage(
+            Reference('other.out[13]'), Reference('kernel[13].in'),
+            None, 0.0, None, Settings({'test1': 12}), 0,
+            b'test').encoded()
+    get_client_mock = MagicMock(return_value=client_mock)
+    communicator._Communicator__get_client = get_client_mock
+    communicator._profiler = MagicMock()
+
+    communicator.receive_message('in')
+    assert communicator.get_message_counts()['in'] == [1]
+
+    with pytest.raises(RuntimeError):
+        # the message received has message_number = 0 again
+        communicator.receive_message('in')
+
+
+def test_port_discard_error_on_resume(caplog, communicator):
+    client_mock = MagicMock()
+    client_mock.receive.return_value = MPPMessage(
+            Reference('other.out[13]'), Reference('kernel[13].in'),
+            None, 0.0, None, Settings({'test1': 12}), 1,
+            b'test').encoded()
+    get_client_mock = MagicMock(return_value=client_mock)
+    communicator._Communicator__get_client = get_client_mock
+    communicator._profiler = MagicMock()
+
+    communicator.restore_message_counts({'out': [0],
+                                         'in': [2],
+                                         'muscle_settings_in': [0]})
+    for port in communicator._ports.values():
+        assert port._is_resuming == [True]
+        assert port.is_resuming(None)
+
+    # In the next block, the first message with message_number=1 is discarded.
+    # The RuntimeError is raised when 'receiving' the second message with
+    # message_number=1
+    with caplog.at_level(logging.DEBUG):
+        with pytest.raises(RuntimeError):
+            communicator.receive_message('in')
+        # records 0, 2 and 3 are debug logs for starting/receiving on port
+        assert 'Discarding received message' in caplog.records[1].message
+
+
+def test_port_discard_success_on_resume(caplog, communicator):
+    client_mock = MagicMock()
+    client_mock.receive.side_effect = [MPPMessage(
+            Reference('other.out[13]'), Reference('kernel[13].in'),
+            None, 0.0, None, Settings({'test1': 12}), message_number,
+            {'this is message': message_number}).encoded()
+            for message_number in [1, 2]]
+    get_client_mock = MagicMock(return_value=client_mock)
+    communicator._Communicator__get_client = get_client_mock
+    communicator._profiler = MagicMock()
+
+    communicator.restore_message_counts({'out': [0],
+                                         'in': [2],
+                                         'muscle_settings_in': [0]})
+    for port in communicator._ports.values():
+        assert port._is_resuming == [True]
+        assert port.is_resuming(None)
+
+    with caplog.at_level(logging.DEBUG):
+        msg = communicator.receive_message('in')
+        # records 0, 2 and 3 are debug logs for starting/receiving on port
+        assert 'Discarding received message' in caplog.records[1].message
+    # message_number=1 should be discarded:
+    assert msg.data == {'this is message': 2}
+    assert communicator.get_message_counts()['in'] == [3]

From dbaaf94fbfae8c82f00e3ae453a95f7b52e7b824 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Tue, 30 Aug 2022 16:10:48 +0200
Subject: [PATCH 021/183] caplog fix when running full test suite

---
 libmuscle/python/libmuscle/test/test_communicator.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libmuscle/python/libmuscle/test/test_communicator.py b/libmuscle/python/libmuscle/test/test_communicator.py
index 8fb7a527..bb41cfa5 100644
--- a/libmuscle/python/libmuscle/test/test_communicator.py
+++ b/libmuscle/python/libmuscle/test/test_communicator.py
@@ -649,7 +649,7 @@ def test_port_discard_error_on_resume(caplog, communicator):
     # In the next block, the first message with message_number=1 is discarded.
     # The RuntimeError is raised when 'receiving' the second message with
     # message_number=1
-    with caplog.at_level(logging.DEBUG):
+    with caplog.at_level(logging.DEBUG, 'libmuscle.communicator'):
         with pytest.raises(RuntimeError):
             communicator.receive_message('in')
         # records 0, 2 and 3 are debug logs for starting/receiving on port
@@ -674,7 +674,7 @@ def test_port_discard_success_on_resume(caplog, communicator):
         assert port._is_resuming == [True]
         assert port.is_resuming(None)
 
-    with caplog.at_level(logging.DEBUG):
+    with caplog.at_level(logging.DEBUG, 'libmuscle.communicator'):
         msg = communicator.receive_message('in')
         # records 0, 2 and 3 are debug logs for starting/receiving on port
         assert 'Discarding received message' in caplog.records[1].message

From c6058c0cf5ef721322c6f0066ec46eb7d59adb0b Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Tue, 30 Aug 2022 16:12:37 +0200
Subject: [PATCH 022/183] Handle simulationtime in TriggerManager

---
 .../python/libmuscle/checkpoint_triggers.py   | 21 +++++++++++--------
 .../test/test_checkpoint_triggers.py          |  8 +++----
 2 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/libmuscle/python/libmuscle/checkpoint_triggers.py b/libmuscle/python/libmuscle/checkpoint_triggers.py
index 214fd872..8bf343fb 100644
--- a/libmuscle/python/libmuscle/checkpoint_triggers.py
+++ b/libmuscle/python/libmuscle/checkpoint_triggers.py
@@ -183,9 +183,9 @@ class TriggerManager:
     """Manages all checkpoint triggers and checks if a snapshot must be saved.
     """
 
-    def __init__(self, reference_utctime: datetime, checkpoints: Checkpoints
+    def __init__(self, utc_reference: datetime, checkpoints: Checkpoints
                  ) -> None:
-        self._monotonic_reference = _utc_to_monotonic(reference_utctime)
+        self._monotonic_reference = _utc_to_monotonic(utc_reference)
 
         self._wall = CombinedCheckpointTriggers(checkpoints.wallclocktime)
         self._prevwall = 0.0
@@ -280,27 +280,30 @@ def reuse_instance(self, max_f_init_next_timestamp: Optional[float]
             self._should_save_final_called = False
             self._saved_final_checkpoint = False
 
-    def update_checkpoints(self, simulationtime: float, final: bool) -> float:
+    def update_checkpoints(self, timestamp: float,
+                           next_timestamp: Optional[float], final: bool
+                           ) -> None:
         """Update last and next checkpoint times when a snapshot is made
 
         Args:
-            simulationtime: next timestamp as reported by the instance (if
-                available, otherwise current timestamp)
-
-        Returns:
-            Current elapsed walltime
+            timestamp: timestamp as reported by the instance
+            next_timestamp: next timestamp as reported by the instance
         """
         self._prevwall = self.elapsed_walltime()
         self._nextwall = self._wall.next_checkpoint(self._prevwall)
 
         if final and self._max_f_init_next_timestamp is not None:
             simulationtime = self._max_f_init_next_timestamp
+        else:
+            if next_timestamp is None:
+                simulationtime = timestamp
+            else:
+                simulationtime = next_timestamp
         self._prevsim = simulationtime
         self._nextsim = self._sim.next_checkpoint(simulationtime)
 
         self._should_have_saved = False
         self._saved_final_checkpoint = final
-        return self._prevwall
 
     def get_triggers(self) -> List[str]:
         """Get trigger description(s) for the current reason for checkpointing.
diff --git a/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py b/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py
index 6cb645c2..17afd0b1 100644
--- a/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py
+++ b/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py
@@ -162,7 +162,7 @@ def test_trigger_manager():
     assert "wallclocktime" in triggers[0]
     with pytest.raises(RuntimeError):  # did not call save in between
         trigger_manager.should_save_snapshot(t, t_next)
-    trigger_manager.update_checkpoints(t_next, False)
+    trigger_manager.update_checkpoints(t, t_next, False)
 
     t, t_next = 0.2, 0.9
     assert not trigger_manager.should_save_snapshot(t, t_next)
@@ -170,7 +170,7 @@ def test_trigger_manager():
     t, t_next = 0.9, 3.1
     assert trigger_manager.should_save_snapshot(t, t_next)
     assert len(trigger_manager.get_triggers()) == 1
-    trigger_manager.update_checkpoints(t_next, False)
+    trigger_manager.update_checkpoints(t, t_next, False)
 
     t, t_next = 3.1, None
     assert trigger_manager.should_save_final_snapshot(t)
@@ -179,7 +179,7 @@ def test_trigger_manager():
     with pytest.raises(RuntimeError):  # did not call save in between
         trigger_manager.should_save_final_snapshot(t)
     assert len(trigger_manager.get_triggers()) > 0
-    trigger_manager.update_checkpoints(t, True)
+    trigger_manager.update_checkpoints(t, t_next, True)
 
     trigger_manager.reuse_instance(None)
 
@@ -191,7 +191,7 @@ def test_trigger_manager():
     assert trigger_manager.should_save_final_snapshot(t)
     with pytest.raises(RuntimeError):  # not saved
         trigger_manager.reuse_instance(None)
-    trigger_manager.update_checkpoints(t, True)
+    trigger_manager.update_checkpoints(t, t_next, True)
 
     trigger_manager.reuse_instance(None)
 

From c67e2543378dd34a042ec28a371381f4ac890f07 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Tue, 30 Aug 2022 16:55:46 +0200
Subject: [PATCH 023/183] Implement SnapshotManager

---
 libmuscle/python/libmuscle/mmp_client.py      |   5 +
 libmuscle/python/libmuscle/snapshot.py        | 103 +++++++++
 .../python/libmuscle/snapshot_manager.py      | 215 ++++++++++++++++++
 3 files changed, 323 insertions(+)
 create mode 100644 libmuscle/python/libmuscle/snapshot.py
 create mode 100644 libmuscle/python/libmuscle/snapshot_manager.py

diff --git a/libmuscle/python/libmuscle/mmp_client.py b/libmuscle/python/libmuscle/mmp_client.py
index fd236b52..0d3108c9 100644
--- a/libmuscle/python/libmuscle/mmp_client.py
+++ b/libmuscle/python/libmuscle/mmp_client.py
@@ -9,6 +9,7 @@
 from libmuscle.mcp.tcp_transport_client import TcpTransportClient
 from libmuscle.profiling import ProfileEvent
 from libmuscle.logging import LogMessage
+from libmuscle.snapshot import SnapshotMetadata
 
 
 CONNECTION_TIMEOUT = 300
@@ -93,6 +94,10 @@ def submit_profile_events(self, events: Iterable[ProfileEvent]) -> None:
                 [encode_profile_event(e) for e in events]]
         self._call_manager(request)
 
+    def submit_snapshot_metadata(self, snapshot_metadata: SnapshotMetadata
+                                 ) -> None:
+        ... # TODO
+
     def get_settings(self) -> Settings:
         """Get the central settings from the manager.
 
diff --git a/libmuscle/python/libmuscle/snapshot.py b/libmuscle/python/libmuscle/snapshot.py
new file mode 100644
index 00000000..f9e8966f
--- /dev/null
+++ b/libmuscle/python/libmuscle/snapshot.py
@@ -0,0 +1,103 @@
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Dict, List, Optional, cast
+
+import msgpack
+
+if TYPE_CHECKING:
+    # prevent circular import
+    from libmuscle.communicator import Message
+
+
+class Snapshot(ABC):
+    """Snapshot data structure.
+
+    This is an abstract base class, implementations are provided by subclasses.
+    """
+    SNAPSHOT_VERSION_BYTE = b'\0'
+
+    def __init__(self,
+                 triggers: List[str],
+                 wallclocktime: float,
+                 port_message_counts: Dict[str, List[int]],
+                 is_final_snapshot: bool,
+                 message: 'Message') -> None:
+        self.triggers = triggers
+        self.wallclocktime = wallclocktime
+        self.port_message_counts = port_message_counts
+        self.is_final_snapshot = is_final_snapshot
+        self.message = message
+
+    @classmethod
+    @abstractmethod
+    def from_bytes(cls, data: bytes) -> 'Snapshot':
+        """Create a snapshot object from binary data.
+
+        Args:
+            data: binary data representing the snapshot. Note that this must
+                **exclude** the versioning byte.
+        """
+        ...
+
+    @abstractmethod
+    def to_bytes(self) -> bytes:
+        """Convert the snapshot object to binary data.
+
+        Returns:
+            Binary data representing the snapshot. Note that this must
+                **exclude** the versioning byte.
+        """
+        ...
+
+
+class MsgPackSnapshot(Snapshot):
+    """Snapshot stored in messagepack format
+    """
+    SNAPSHOT_VERSION_BYTE = b'1'
+
+    @classmethod
+    def from_bytes(cls, data: bytes) -> 'Snapshot':
+        dct = msgpack.loads(data)
+        return cls(dct['triggers'],
+                   dct['wallclocktime'],
+                   dct['port_message_counts'],
+                   dct['is_final_snapshot'],
+                   dct['message'])
+
+    def to_bytes(self) -> bytes:
+        return cast(bytes, msgpack.dumps({
+            'triggers': self.triggers,
+            'wallclocktime': self.wallclocktime,
+            'port_message_counts': self.port_message_counts,
+            'is_final_snapshot': self.is_final_snapshot,
+            'message': self.message
+        }))
+
+
+@dataclass
+class SnapshotMetadata:
+    """Metadata of a snapshot for sending to the muscle_manager.
+    """
+    triggers: List[str]
+    wallclocktime: float
+    timestamp: float
+    next_timestamp: Optional[float]
+    port_message_counts: Dict[str, List[int]]
+    is_final_snapshot: bool
+    # storing as str, because Path cannot be serialized by msgpack
+    snapshot_filename: str
+
+    @staticmethod
+    def from_snapshot(snapshot: Snapshot, snapshot_filename: str
+                      ) -> 'SnapshotMetadata':
+        """Create snapshot metadata from the given snapshot and filename
+        """
+        return SnapshotMetadata(
+            snapshot.triggers,
+            snapshot.wallclocktime,
+            snapshot.message.timestamp,
+            snapshot.message.next_timestamp,
+            snapshot.port_message_counts,
+            snapshot.is_final_snapshot,
+            snapshot_filename
+        )
diff --git a/libmuscle/python/libmuscle/snapshot_manager.py b/libmuscle/python/libmuscle/snapshot_manager.py
new file mode 100644
index 00000000..afce7908
--- /dev/null
+++ b/libmuscle/python/libmuscle/snapshot_manager.py
@@ -0,0 +1,215 @@
+import logging
+from datetime import datetime
+from pathlib import Path
+from typing import Optional, cast
+
+from ymmsl import Checkpoints, Reference
+
+from libmuscle.checkpoint_triggers import TriggerManager
+from libmuscle.communicator import Communicator, Message
+from libmuscle.mmp_client import MMPClient
+from libmuscle.snapshot import MsgPackSnapshot, Snapshot, SnapshotMetadata
+
+_logger = logging.getLogger(__name__)
+
+_MAX_FILE_EXISTS_CHECK = 10000
+
+
+class SnapshotManager:
+    """Manages information on snapshots for the Instance
+
+    Implements the public checkpointing API with handoffs to
+    :class:`TriggerManager` for checkpoint triggers.
+    """
+
+    def __init__(self,
+                 instance_id: Reference,
+                 manager: MMPClient,
+                 communicator: Communicator) -> None:
+        """Create a new snapshot manager
+
+        Args:
+            instance_id: The id of this instance.
+            manager: The client used to submit data to the manager.
+            communicator: The communicator belonging to this instance.
+        """
+        self._instance_id = instance_id
+        # replace identifier[i] by identifier-i to use in snapshot file name
+        # using a dash (-) because that is not allowed in Identifiers
+        self._safe_id = str(instance_id).replace("[", "-").replace("]", "")
+        self._communicator = communicator
+        self._manager = manager
+
+        self._first_reuse = True
+        self._resume_from_snapshot = None   # type: Optional[Snapshot]
+        self._trigger = None                # type: Optional[TriggerManager]
+        self._snapshot_directory = None     # type: Optional[Path]
+        self._next_snapshot_num = 1
+
+    def registered(self,
+                   utc_reference: datetime,
+                   checkpoints: Checkpoints,
+                   resume: Optional[Path]) -> None:
+        """Callback after registering with the manager.
+
+        Provide the snapshot manager with info on workflow checkpoints and if we
+        should resume from a previous snapshot.
+
+        Args:
+            utc_reference: datetime (in UTC timezone) indicating wallclocktime=0
+            checkpoints: requested workflow checkpoints
+            resume: previous snapshot to resume from (or None if not resuming)
+        """
+        if checkpoints:
+            self._trigger = TriggerManager(utc_reference, checkpoints)
+        if resume is not None:
+            self.__load_snapshot(resume)
+            snapshot = cast(Snapshot, self._resume_from_snapshot)
+            self._communicator.restore_message_counts(
+                snapshot.port_message_counts)
+
+    def reuse_instance(self,
+                       max_f_init_next_timestamp: Optional[float],
+                       snapshot_directory: Path,
+                       ) -> None:
+        """Callback on Instance.reuse_instance
+
+        Args:
+            max_f_init_next_timestamp: maximum next_timestamp of all F_INIT
+                messages. May be None if no message has next_timestamp set or
+                if no F_INIT messages were received.
+        """
+        if self._trigger is not None:
+            self._trigger.reuse_instance(max_f_init_next_timestamp)
+
+        self._snapshot_directory = snapshot_directory
+
+        if self._first_reuse:
+            self._first_reuse = False
+        else:
+            self._resume_from_snapshot = None
+
+    def resuming(self) -> bool:
+        """Check if we are resuming during this reuse iteration.
+        """
+        return self._resume_from_snapshot is not None
+
+    def load_snapshot(self) -> Message:
+        """Get the Message to resume from
+        """
+        if self._resume_from_snapshot is None:
+            raise RuntimeError('No snapshot to load. Use "instance.resuming()"'
+                               ' to check if a snapshot is available')
+        return self._resume_from_snapshot.message
+
+    def should_save_snapshot(self, timestamp: float,
+                             next_timestamp: Optional[float]) -> bool:
+        """See :meth:`TriggerManager.should_save_snapshot`
+        """
+        if self._trigger is None:
+            return False  # checkpointing disabled
+        return self._trigger.should_save_snapshot(timestamp, next_timestamp)
+
+    def should_save_final_snapshot(self, timestamp: float) -> bool:
+        """See :meth:`TriggerManager.should_save_final_snapshot`
+        """
+        if self._trigger is None:
+            return False  # checkpointing disabled
+        return self._trigger.should_save_final_snapshot(timestamp)
+
+    def save_snapshot(self, msg: Message) -> None:
+        """Save snapshot contained in the message object.
+        """
+        self.__save_snapshot(msg, False)
+
+    def save_final_snapshot(self, msg: Message) -> None:
+        """Save final snapshot contained in the message object
+        """
+        self.__save_snapshot(msg, True)
+
+    def __save_snapshot(self, msg: Message, final: bool) -> None:
+        """Actual implementation used by save_(final_)snapshot.
+
+        Args:
+            msg: message object representing the snapshot
+            final: True iff called from save_final_snapshot
+        """
+        if self._trigger is None:
+            _logger.warning('Saving a snapshot but no checkpoints requested'
+                            ' by the workflow.')
+            triggers = []
+            wallclocktime = 0.0
+        else:
+            triggers = self._trigger.get_triggers()
+            wallclocktime = self._trigger.elapsed_walltime()
+
+        port_message_counts = self._communicator.get_message_counts()
+        snapshot = MsgPackSnapshot(
+            triggers, wallclocktime, port_message_counts, final, msg)
+
+        path = self.__store_snapshot(snapshot)
+        metadata = SnapshotMetadata.from_snapshot(snapshot, str(path))
+        self._manager.submit_snapshot_metadata(metadata)
+
+        if self._trigger is not None:
+            self._trigger.update_checkpoints(
+                msg.timestamp, msg.next_timestamp, final)
+
+    def __load_snapshot(self, snapshot_location: Path) -> None:
+        """Load a previously stored snapshot from the filesystem
+
+        Args:
+            snapshot_location: path where the snapshot is stored
+        """
+        if not snapshot_location.is_file():
+            raise RuntimeError(f'Unable to load snapshot: {snapshot_location}'
+                               ' is not a file. Please ensure this path exists'
+                               ' and can be read.')
+
+        # TODO: encapsulate I/O errors?
+        with snapshot_location.open("rb") as snapshot_file:
+            version = snapshot_file.read(1)
+            data = snapshot_file.read()
+
+            if version == MsgPackSnapshot.SNAPSHOT_VERSION_BYTE:
+                self._resume_from_snapshot = MsgPackSnapshot.from_bytes(data)
+            else:
+                raise RuntimeError('Unable to load snapshot from'
+                                   f' {snapshot_location}: unknown version of'
+                                   ' snapshot file. Was the file saved with a'
+                                   ' different version of libmuscle or'
+                                   ' tampered with?')
+
+    def __store_snapshot(self, snapshot: Snapshot) -> Path:
+        """Store a snapshot on the filesystem
+
+        Args:
+            snapshot: snapshot to store
+
+        Returns:
+            Path where the snapshot is stored
+        """
+        if self._snapshot_directory is None:
+            raise RuntimeError('Unknown snapshot directory. Did you try to'
+                               ' save a snapshot before entering the reuse'
+                               ' loop?')
+        for _ in range(_MAX_FILE_EXISTS_CHECK):
+            # Expectation is that muscle_snapshot_directory is empty initially
+            # and we succeed in the first loop. Still wrapping in a for-loop
+            # such that an existing filename doesn't immediately raise an error
+            fname = f"{self._safe_id}_{self._next_snapshot_num}.pack"
+            fpath = self._snapshot_directory / fname
+            self._next_snapshot_num += 1
+            if not fpath.exists():
+                break
+        else:
+            raise RuntimeError('Could not find an available filename for'
+                               f' storing the next snapshot: {fpath} already'
+                               ' exists.')
+        # Opening with mode 'x' since a file with the same name may be created
+        # in the small window between checking above and opening here. It is
+        # better to fail with an error than to overwrite an existing file.
+        with fpath.open('xb') as snapshot_file:
+            snapshot_file.write(snapshot.SNAPSHOT_VERSION_BYTE)
+            snapshot_file.write(snapshot.to_bytes())
+        return fpath

From f7bd7e11b69e7b6daade72374be442a8660f412a Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Tue, 30 Aug 2022 17:28:47 +0200
Subject: [PATCH 024/183] Fix flake8 issue

---
 libmuscle/python/libmuscle/mmp_client.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libmuscle/python/libmuscle/mmp_client.py b/libmuscle/python/libmuscle/mmp_client.py
index 0d3108c9..6771b1dd 100644
--- a/libmuscle/python/libmuscle/mmp_client.py
+++ b/libmuscle/python/libmuscle/mmp_client.py
@@ -96,7 +96,7 @@ def submit_profile_events(self, events: Iterable[ProfileEvent]) -> None:
 
     def submit_snapshot_metadata(self, snapshot_metadata: SnapshotMetadata
                                  ) -> None:
-        ... # TODO
+        ...  # TODO
 
     def get_settings(self) -> Settings:
         """Get the central settings from the manager.

From 9a7db1b9c63c6c43052b889d2b22a7e23eb7ac50 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Tue, 30 Aug 2022 17:29:24 +0200
Subject: [PATCH 025/183] Add snapshot tests

---
 libmuscle/python/libmuscle/snapshot.py        | 31 +++++++++---
 .../python/libmuscle/test/test_snapshot.py    | 50 +++++++++++++++++++
 2 files changed, 74 insertions(+), 7 deletions(-)
 create mode 100644 libmuscle/python/libmuscle/test/test_snapshot.py

diff --git a/libmuscle/python/libmuscle/snapshot.py b/libmuscle/python/libmuscle/snapshot.py
index f9e8966f..560e1129 100644
--- a/libmuscle/python/libmuscle/snapshot.py
+++ b/libmuscle/python/libmuscle/snapshot.py
@@ -1,12 +1,12 @@
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Dict, List, Optional, cast
+from typing import Dict, List, Optional, cast
 
 import msgpack
+from ymmsl import Reference, Settings
 
-if TYPE_CHECKING:
-    # prevent circular import
-    from libmuscle.communicator import Message
+from libmuscle.mpp_message import MPPMessage
+from libmuscle import communicator
 
 
 class Snapshot(ABC):
@@ -21,7 +21,7 @@ def __init__(self,
                  wallclocktime: float,
                  port_message_counts: Dict[str, List[int]],
                  is_final_snapshot: bool,
-                 message: 'Message') -> None:
+                 message: 'communicator.Message') -> None:
         self.triggers = triggers
         self.wallclocktime = wallclocktime
         self.port_message_counts = port_message_counts
@@ -62,7 +62,7 @@ def from_bytes(cls, data: bytes) -> 'Snapshot':
                    dct['wallclocktime'],
                    dct['port_message_counts'],
                    dct['is_final_snapshot'],
-                   dct['message'])
+                   cls.bytes_to_message(dct['message']))
 
     def to_bytes(self) -> bytes:
         return cast(bytes, msgpack.dumps({
@@ -70,9 +70,26 @@ def to_bytes(self) -> bytes:
             'wallclocktime': self.wallclocktime,
             'port_message_counts': self.port_message_counts,
             'is_final_snapshot': self.is_final_snapshot,
-            'message': self.message
+            'message': self.message_to_bytes(self.message)
         }))
 
+    @staticmethod
+    def message_to_bytes(message: 'communicator.Message') -> bytes:
+        """Use MPPMessage serializer for serializing the message object
+        """
+        return MPPMessage(Reference('_'), Reference('_'), None,
+                          message.timestamp, message.next_timestamp,
+                          Settings(), 0, message.data).encoded()
+
+    @staticmethod
+    def bytes_to_message(data: bytes) -> 'communicator.Message':
+        """Use MPPMessage deserializer for serializing the message object
+        """
+        mpp_message = MPPMessage.from_bytes(data)
+        return communicator.Message(mpp_message.timestamp,
+                                    mpp_message.next_timestamp,
+                                    mpp_message.data)
+
 
 @dataclass
 class SnapshotMetadata:
diff --git a/libmuscle/python/libmuscle/test/test_snapshot.py b/libmuscle/python/libmuscle/test/test_snapshot.py
new file mode 100644
index 00000000..b238df44
--- /dev/null
+++ b/libmuscle/python/libmuscle/test/test_snapshot.py
@@ -0,0 +1,50 @@
+import pytest
+
+from libmuscle.communicator import Message
+from libmuscle.snapshot import Snapshot, MsgPackSnapshot, SnapshotMetadata
+
+
+@pytest.fixture
+def snapshot() -> Snapshot:
+    triggers = ["test triggers"]
+    wallclocktime = 15.3
+    port_message_counts = {'in': [1], 'out': [4], 'muscle_settings_in': [0]}
+    is_final = True
+    message = Message(1.2, None, "test_data")
+    snapshot = MsgPackSnapshot(
+            triggers, wallclocktime, port_message_counts, is_final, message)
+    assert snapshot.triggers == triggers
+    assert snapshot.wallclocktime == wallclocktime
+    assert snapshot.port_message_counts == port_message_counts
+    assert snapshot.is_final_snapshot == is_final
+    assert snapshot.message == message
+    return snapshot
+
+
+def test_snapshot(snapshot: Snapshot) -> None:
+    assert isinstance(snapshot, Snapshot)
+
+    binary_snapshot = snapshot.to_bytes()
+    assert isinstance(binary_snapshot, bytes)
+
+    snapshot2 = MsgPackSnapshot.from_bytes(binary_snapshot)
+
+    assert snapshot2.triggers == snapshot.triggers
+    assert snapshot2.wallclocktime == snapshot.wallclocktime
+    assert snapshot2.port_message_counts == snapshot.port_message_counts
+    assert snapshot2.is_final_snapshot == snapshot.is_final_snapshot
+    assert snapshot2.message.timestamp == snapshot.message.timestamp
+    assert snapshot2.message.next_timestamp == snapshot.message.next_timestamp
+    assert snapshot2.message.data == snapshot.message.data
+
+
+def test_snapshot_metadata(snapshot: Snapshot) -> None:
+    metadata = SnapshotMetadata.from_snapshot(snapshot, "test")
+
+    assert metadata.triggers == snapshot.triggers
+    assert metadata.wallclocktime == snapshot.wallclocktime
+    assert metadata.port_message_counts == snapshot.port_message_counts
+    assert metadata.is_final_snapshot == snapshot.is_final_snapshot
+    assert metadata.timestamp == snapshot.message.timestamp
+    assert metadata.next_timestamp == snapshot.message.next_timestamp
+    assert metadata.snapshot_filename == "test"

From 773d1284f67eb512be0ab3b902806fbd0d9aa89c Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Wed, 31 Aug 2022 10:45:48 +0200
Subject: [PATCH 026/183] Add tests for SnapshotManager & fix bugs

---
 .../python/libmuscle/checkpoint_triggers.py   |   3 +
 .../python/libmuscle/snapshot_manager.py      |  11 +-
 .../libmuscle/test/test_snapshot_manager.py   | 100 ++++++++++++++++++
 3 files changed, 111 insertions(+), 3 deletions(-)
 create mode 100644 libmuscle/python/libmuscle/test/test_snapshot_manager.py

diff --git a/libmuscle/python/libmuscle/checkpoint_triggers.py b/libmuscle/python/libmuscle/checkpoint_triggers.py
index 8bf343fb..b3715525 100644
--- a/libmuscle/python/libmuscle/checkpoint_triggers.py
+++ b/libmuscle/python/libmuscle/checkpoint_triggers.py
@@ -302,6 +302,9 @@ def update_checkpoints(self, timestamp: float,
         self._prevsim = simulationtime
         self._nextsim = self._sim.next_checkpoint(simulationtime)
 
+        # this method is also called during resume, after which we no longer
+        # consider the simulationtime as reset
+        self._sim_reset = False
         self._should_have_saved = False
         self._saved_final_checkpoint = final
 
diff --git a/libmuscle/python/libmuscle/snapshot_manager.py b/libmuscle/python/libmuscle/snapshot_manager.py
index afce7908..e17cb460 100644
--- a/libmuscle/python/libmuscle/snapshot_manager.py
+++ b/libmuscle/python/libmuscle/snapshot_manager.py
@@ -67,6 +67,11 @@ def registered(self,
             snapshot = cast(Snapshot, self._resume_from_snapshot)
             self._communicator.restore_message_counts(
                 snapshot.port_message_counts)
+            if self._trigger:
+                self._trigger.update_checkpoints(
+                    snapshot.message.timestamp,
+                    snapshot.message.next_timestamp,
+                    snapshot.is_final_snapshot)
 
     def reuse_instance(self,
                        max_f_init_next_timestamp: Optional[float],
@@ -135,8 +140,8 @@ def __save_snapshot(self, msg: Message, final: bool) -> None:
             final: True iff called from save_final_snapshot
         """
         if self._trigger is None:
-            _logger.warning('Saving a snapshot but no checkpoints requested'
-                            ' by the workflow.')
+            _logger.info('Saving a snapshot but no checkpoints requested'
+                         ' by the workflow.')
             triggers = []
             wallclocktime = 0.0
         else:
@@ -167,7 +172,7 @@ def __load_snapshot(self, snapshot_location: Path) -> None:
                                ' and can be read.')
 
         # TODO: encapsulate I/O errors?
-        with snapshot_location.open("rb") as snapshot_file:
+        with snapshot_location.open('rb') as snapshot_file:
             version = snapshot_file.read(1)
             data = snapshot_file.read()
 
diff --git a/libmuscle/python/libmuscle/test/test_snapshot_manager.py b/libmuscle/python/libmuscle/test/test_snapshot_manager.py
new file mode 100644
index 00000000..b5a8edde
--- /dev/null
+++ b/libmuscle/python/libmuscle/test/test_snapshot_manager.py
@@ -0,0 +1,100 @@
+from datetime import datetime, timezone
+import logging
+from pathlib import Path
+from unittest.mock import MagicMock
+
+import pytest
+from ymmsl import Reference, Checkpoints, CheckpointRules
+
+from libmuscle.communicator import Message
+from libmuscle.snapshot import SnapshotMetadata
+from libmuscle.snapshot_manager import SnapshotManager
+
+
+def test_no_checkpointing(caplog: pytest.LogCaptureFixture, tmp_path: Path
+                          ) -> None:
+    manager = MagicMock()
+    communicator = MagicMock()
+    communicator.get_message_counts.return_value = {}
+    snapshot_manager = SnapshotManager(Reference('test'), manager, communicator)
+
+    snapshot_manager.registered(datetime.now(timezone.utc), Checkpoints(), None)
+
+    snapshot_manager.reuse_instance(None, Path(tmp_path))
+    assert not snapshot_manager.resuming()
+    assert not snapshot_manager.should_save_snapshot(1, None)
+    assert not snapshot_manager.should_save_snapshot(5000, None)
+    assert not snapshot_manager.should_save_final_snapshot(1000)
+
+    with caplog.at_level(logging.INFO, 'libmuscle.snapshot_manager'):
+        snapshot_manager.save_snapshot(Message(1.0, None, None))
+        assert caplog.records[0].levelname == "INFO"
+        assert "no checkpoints" in caplog.records[0].message
+
+
+def test_save_load_checkpoint(tmp_path: Path) -> None:
+    manager = MagicMock()
+    communicator = MagicMock()
+    port_message_counts = {'in': [1], 'out': [2], 'muscle_settings_in': [0]}
+    communicator.get_message_counts.return_value = port_message_counts
+
+    instance_id = Reference('test[1]')
+    snapshot_manager = SnapshotManager(instance_id, manager, communicator)
+
+    checkpoints = Checkpoints(simulationtime=CheckpointRules(every=1))
+    snapshot_manager.registered(datetime.now(timezone.utc), checkpoints, None)
+
+    snapshot_manager.reuse_instance(None, tmp_path)
+    with pytest.raises(RuntimeError):
+        snapshot_manager.load_snapshot()
+
+    assert not snapshot_manager.resuming()
+    assert snapshot_manager.should_save_snapshot(0.2, 0.4)
+    snapshot_manager.save_snapshot(Message(0.2, 0.4, 'test data'))
+
+    communicator.get_message_counts.assert_called_with()
+    manager.submit_snapshot_metadata.assert_called()
+    metadata = manager.submit_snapshot_metadata.call_args.args[0]
+    assert isinstance(metadata, SnapshotMetadata)
+    assert metadata.triggers
+    assert metadata.wallclocktime > 0.0
+    assert metadata.timestamp == 0.2
+    assert metadata.next_timestamp == 0.4
+    assert metadata.port_message_counts == port_message_counts
+    assert not metadata.is_final_snapshot
+    fpath = Path(metadata.snapshot_filename)
+    assert fpath.parent == tmp_path
+    assert fpath.name == 'test-1_1.pack'
+
+    snapshot_manager2 = SnapshotManager(instance_id, manager, communicator)
+
+    snapshot_manager2.registered(datetime.now(timezone.utc), checkpoints, fpath)
+    communicator.restore_message_counts.assert_called_with(port_message_counts)
+
+    assert snapshot_manager2.resuming()
+    snapshot_manager2.reuse_instance(None, tmp_path)
+    assert snapshot_manager2.resuming()
+    msg = snapshot_manager2.load_snapshot()
+    assert msg.timestamp == 0.2
+    assert msg.next_timestamp == 0.4
+    assert msg.data == 'test data'
+
+    assert not snapshot_manager2.should_save_snapshot(0.4, 0.6)
+    assert snapshot_manager2.should_save_final_snapshot(0.6)
+    snapshot_manager2.save_final_snapshot(Message(0.6, None, 'test data2'))
+
+    metadata = manager.submit_snapshot_metadata.call_args.args[0]
+    assert isinstance(metadata, SnapshotMetadata)
+    assert metadata.triggers
+    assert metadata.wallclocktime > 0.0
+    assert metadata.timestamp == 0.6
+    assert metadata.next_timestamp is None
+    assert metadata.port_message_counts == port_message_counts
+    assert metadata.is_final_snapshot
+    fpath = Path(metadata.snapshot_filename)
+    assert fpath.parent == tmp_path
+    assert fpath.name == 'test-1_2.pack'
+
+    assert snapshot_manager2.resuming()
+    snapshot_manager2.reuse_instance(None, tmp_path)
+    assert not snapshot_manager2.resuming()

From cbb6b4f4378fd718cd72e63678ca260f44a4bd24 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Wed, 31 Aug 2022 10:46:23 +0200
Subject: [PATCH 027/183] Allow saving snapshot messages with Settings

---
 libmuscle/python/libmuscle/snapshot.py        |  8 +++++--
 .../python/libmuscle/test/test_snapshot.py    | 21 +++++++++++++++----
 2 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/libmuscle/python/libmuscle/snapshot.py b/libmuscle/python/libmuscle/snapshot.py
index 560e1129..324ab76f 100644
--- a/libmuscle/python/libmuscle/snapshot.py
+++ b/libmuscle/python/libmuscle/snapshot.py
@@ -77,9 +77,12 @@ def to_bytes(self) -> bytes:
     def message_to_bytes(message: 'communicator.Message') -> bytes:
         """Use MPPMessage serializer for serializing the message object
         """
+        settings = Settings()
+        if message.settings is not None:
+            settings = message.settings
         return MPPMessage(Reference('_'), Reference('_'), None,
                           message.timestamp, message.next_timestamp,
-                          Settings(), 0, message.data).encoded()
+                          settings, 0, message.data).encoded()
 
     @staticmethod
     def bytes_to_message(data: bytes) -> 'communicator.Message':
@@ -88,7 +91,8 @@ def bytes_to_message(data: bytes) -> 'communicator.Message':
         mpp_message = MPPMessage.from_bytes(data)
         return communicator.Message(mpp_message.timestamp,
                                     mpp_message.next_timestamp,
-                                    mpp_message.data)
+                                    mpp_message.data,
+                                    mpp_message.settings_overlay)
 
 
 @dataclass
diff --git a/libmuscle/python/libmuscle/test/test_snapshot.py b/libmuscle/python/libmuscle/test/test_snapshot.py
index b238df44..82c0d6a5 100644
--- a/libmuscle/python/libmuscle/test/test_snapshot.py
+++ b/libmuscle/python/libmuscle/test/test_snapshot.py
@@ -1,4 +1,5 @@
 import pytest
+from ymmsl import Settings
 
 from libmuscle.communicator import Message
 from libmuscle.snapshot import Snapshot, MsgPackSnapshot, SnapshotMetadata
@@ -6,11 +7,11 @@
 
 @pytest.fixture
 def snapshot() -> Snapshot:
-    triggers = ["test triggers"]
+    triggers = ['test triggers']
     wallclocktime = 15.3
     port_message_counts = {'in': [1], 'out': [4], 'muscle_settings_in': [0]}
     is_final = True
-    message = Message(1.2, None, "test_data")
+    message = Message(1.2, None, 'test_data')
     snapshot = MsgPackSnapshot(
             triggers, wallclocktime, port_message_counts, is_final, message)
     assert snapshot.triggers == triggers
@@ -39,7 +40,7 @@ def test_snapshot(snapshot: Snapshot) -> None:
 
 
 def test_snapshot_metadata(snapshot: Snapshot) -> None:
-    metadata = SnapshotMetadata.from_snapshot(snapshot, "test")
+    metadata = SnapshotMetadata.from_snapshot(snapshot, 'test')
 
     assert metadata.triggers == snapshot.triggers
     assert metadata.wallclocktime == snapshot.wallclocktime
@@ -47,4 +48,16 @@ def test_snapshot_metadata(snapshot: Snapshot) -> None:
     assert metadata.is_final_snapshot == snapshot.is_final_snapshot
     assert metadata.timestamp == snapshot.message.timestamp
     assert metadata.next_timestamp == snapshot.message.next_timestamp
-    assert metadata.snapshot_filename == "test"
+    assert metadata.snapshot_filename == 'test'
+
+
+def test_message_with_settings() -> None:
+    message = Message(1.0, 2.0, 'test_data', Settings({'setting': True}))
+    snapshot = MsgPackSnapshot([], 0, {}, False, message)
+    assert snapshot.message.settings.get('setting') is True
+
+    binary_snapshot = snapshot.to_bytes()
+    assert isinstance(binary_snapshot, bytes)
+
+    snapshot2 = MsgPackSnapshot.from_bytes(binary_snapshot)
+    assert snapshot2.message.settings.get('setting') is True

From 1beb84798cbd3e5f708f38e66b0c063c69cd8977 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Wed, 31 Aug 2022 11:01:14 +0200
Subject: [PATCH 028/183] Rename wallclocktime/simulationtime to *_time

See related commit in ymmsl-python:
https://github.com/multiscale/ymmsl-python/commit/2b0401969a8b7c8ae807f388aee2320c2c8b57b4
---
 .../python/libmuscle/checkpoint_triggers.py   | 32 +++++++++----------
 libmuscle/python/libmuscle/snapshot.py        | 12 +++----
 .../python/libmuscle/snapshot_manager.py      |  8 ++---
 .../test/test_checkpoint_triggers.py          |  8 ++---
 .../python/libmuscle/test/test_snapshot.py    | 10 +++---
 .../libmuscle/test/test_snapshot_manager.py   |  6 ++--
 6 files changed, 38 insertions(+), 38 deletions(-)

diff --git a/libmuscle/python/libmuscle/checkpoint_triggers.py b/libmuscle/python/libmuscle/checkpoint_triggers.py
index b3715525..dbb8fcb4 100644
--- a/libmuscle/python/libmuscle/checkpoint_triggers.py
+++ b/libmuscle/python/libmuscle/checkpoint_triggers.py
@@ -187,11 +187,11 @@ def __init__(self, utc_reference: datetime, checkpoints: Checkpoints
                  ) -> None:
         self._monotonic_reference = _utc_to_monotonic(utc_reference)
 
-        self._wall = CombinedCheckpointTriggers(checkpoints.wallclocktime)
+        self._wall = CombinedCheckpointTriggers(checkpoints.wallclock_time)
         self._prevwall = 0.0
         self._nextwall = self._wall.next_checkpoint(0.0)  # type: Optional[float]
 
-        self._sim = CombinedCheckpointTriggers(checkpoints.simulationtime)
+        self._sim = CombinedCheckpointTriggers(checkpoints.simulation_time)
         self._prevsim = None        # type: Optional[float]
         self._nextsim = None        # type: Optional[float]
         self._sim_reset = True
@@ -206,7 +206,7 @@ def __init__(self, utc_reference: datetime, checkpoints: Checkpoints
         self._saved_final_checkpoint = False
 
     def elapsed_walltime(self) -> float:
-        """Returns elapsed wallclocktime in seconds.
+        """Returns elapsed wallclock_time in seconds.
         """
         return time.monotonic() - self._monotonic_reference
 
@@ -293,17 +293,17 @@ def update_checkpoints(self, timestamp: float,
         self._nextwall = self._wall.next_checkpoint(self._prevwall)
 
         if final and self._max_f_init_next_timestamp is not None:
-            simulationtime = self._max_f_init_next_timestamp
+            simulation_time = self._max_f_init_next_timestamp
         else:
             if next_timestamp is None:
-                simulationtime = timestamp
+                simulation_time = timestamp
             else:
-                simulationtime = next_timestamp
-        self._prevsim = simulationtime
-        self._nextsim = self._sim.next_checkpoint(simulationtime)
+                simulation_time = next_timestamp
+        self._prevsim = simulation_time
+        self._nextsim = self._sim.next_checkpoint(simulation_time)
 
         # this method is also called during resume, after which we no longer
-        # consider the simulationtime as reset
+        # consider the simulation_time as reset
         self._sim_reset = False
         self._should_have_saved = False
         self._saved_final_checkpoint = final
@@ -315,29 +315,29 @@ def get_triggers(self) -> List[str]:
         self._last_triggers = []
         return triggers
 
-    def __should_save(self, walltime: float, simulationtime: float) -> bool:
+    def __should_save(self, walltime: float, simulation_time: float) -> bool:
         """Check if a checkpoint should be taken
 
         Args:
             walltime: current wallclock time (elapsed since reference)
-            simulationtime: current/next timestamp as reported by the instance
+            simulation_time: current/next timestamp as reported by the instance
         """
         if self._sim_reset:
             # we cannot make assumptions about the start time of a simulation,
             # a t=-1000 could make sense if t represents years since CE
             # and we should not disallow checkpointing for negative t
-            previous = self._sim.previous_checkpoint(simulationtime)
+            previous = self._sim.previous_checkpoint(simulation_time)
             if previous is not None:
                 # there is a checkpoint rule before the current moment, assume
                 # we should have taken a snapshot back then
                 self._nextsim = previous
             else:
-                self._nextsim = self._sim.next_checkpoint(simulationtime)
+                self._nextsim = self._sim.next_checkpoint(simulation_time)
             self._sim_reset = False
 
         self._last_triggers = []
         if self._nextwall is not None and walltime >= self._nextwall:
-            self._last_triggers.append(f"wallclocktime >= {self._nextwall}")
-        if self._nextsim is not None and simulationtime >= self._nextsim:
-            self._last_triggers.append(f"simulationtime >= {self._nextsim}")
+            self._last_triggers.append(f"wallclock_time >= {self._nextwall}")
+        if self._nextsim is not None and simulation_time >= self._nextsim:
+            self._last_triggers.append(f"simulation_time >= {self._nextsim}")
         return bool(self._last_triggers)
diff --git a/libmuscle/python/libmuscle/snapshot.py b/libmuscle/python/libmuscle/snapshot.py
index 324ab76f..93ed9307 100644
--- a/libmuscle/python/libmuscle/snapshot.py
+++ b/libmuscle/python/libmuscle/snapshot.py
@@ -18,12 +18,12 @@ class Snapshot(ABC):
 
     def __init__(self,
                  triggers: List[str],
-                 wallclocktime: float,
+                 wallclock_time: float,
                  port_message_counts: Dict[str, List[int]],
                  is_final_snapshot: bool,
                  message: 'communicator.Message') -> None:
         self.triggers = triggers
-        self.wallclocktime = wallclocktime
+        self.wallclock_time = wallclock_time
         self.port_message_counts = port_message_counts
         self.is_final_snapshot = is_final_snapshot
         self.message = message
@@ -59,7 +59,7 @@ class MsgPackSnapshot(Snapshot):
     def from_bytes(cls, data: bytes) -> 'Snapshot':
         dct = msgpack.loads(data)
         return cls(dct['triggers'],
-                   dct['wallclocktime'],
+                   dct['wallclock_time'],
                    dct['port_message_counts'],
                    dct['is_final_snapshot'],
                    cls.bytes_to_message(dct['message']))
@@ -67,7 +67,7 @@ def from_bytes(cls, data: bytes) -> 'Snapshot':
     def to_bytes(self) -> bytes:
         return cast(bytes, msgpack.dumps({
             'triggers': self.triggers,
-            'wallclocktime': self.wallclocktime,
+            'wallclock_time': self.wallclock_time,
             'port_message_counts': self.port_message_counts,
             'is_final_snapshot': self.is_final_snapshot,
             'message': self.message_to_bytes(self.message)
@@ -100,7 +100,7 @@ class SnapshotMetadata:
     """Metadata of a snapshot for sending to the muscle_manager.
     """
     triggers: List[str]
-    wallclocktime: float
+    wallclock_time: float
     timestamp: float
     next_timestamp: Optional[float]
     port_message_counts: Dict[str, List[int]]
@@ -115,7 +115,7 @@ def from_snapshot(snapshot: Snapshot, snapshot_filename: str
         """
         return SnapshotMetadata(
             snapshot.triggers,
-            snapshot.wallclocktime,
+            snapshot.wallclock_time,
             snapshot.message.timestamp,
             snapshot.message.next_timestamp,
             snapshot.port_message_counts,
diff --git a/libmuscle/python/libmuscle/snapshot_manager.py b/libmuscle/python/libmuscle/snapshot_manager.py
index e17cb460..f8477637 100644
--- a/libmuscle/python/libmuscle/snapshot_manager.py
+++ b/libmuscle/python/libmuscle/snapshot_manager.py
@@ -56,7 +56,7 @@ def registered(self,
         should resume from a previous snapshot.
 
         Args:
-            utc_reference: datetime (in UTC timezone) indicating wallclocktime=0
+            utc_reference: datetime (in UTC) indicating wallclock_time=0
             checkpoints: requested workflow checkpoints
             resume: previous snapshot to resume from (or None if not resuming)
         """
@@ -143,14 +143,14 @@ def __save_snapshot(self, msg: Message, final: bool) -> None:
             _logger.info('Saving a snapshot but no checkpoints requested'
                          ' by the workflow.')
             triggers = []
-            wallclocktime = 0.0
+            wallclock_time = 0.0
         else:
             triggers = self._trigger.get_triggers()
-            wallclocktime = self._trigger.elapsed_walltime()
+            wallclock_time = self._trigger.elapsed_walltime()
 
         port_message_counts = self._communicator.get_message_counts()
         snapshot = MsgPackSnapshot(
-            triggers, wallclocktime, port_message_counts, final, msg)
+            triggers, wallclock_time, port_message_counts, final, msg)
 
         path = self.__store_snapshot(snapshot)
         metadata = SnapshotMetadata.from_snapshot(snapshot, str(path))
diff --git a/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py b/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py
index 17afd0b1..41577ad3 100644
--- a/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py
+++ b/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py
@@ -150,8 +150,8 @@ def test_trigger_manager_reference_time():
 def test_trigger_manager():
     reference = datetime.now(timezone.utc)
     trigger_manager = TriggerManager(reference, Checkpoints(
-            wallclocktime=CheckpointRules(at=[1e-12]),
-            simulationtime=CheckpointRules(at=[1, 3, 5])))
+            wallclock_time=CheckpointRules(at=[1e-12]),
+            simulation_time=CheckpointRules(at=[1, 3, 5])))
 
     trigger_manager.reuse_instance(7)
 
@@ -159,7 +159,7 @@ def test_trigger_manager():
     assert trigger_manager.should_save_snapshot(t, t_next)
     triggers = trigger_manager.get_triggers()
     assert len(triggers) == 1
-    assert "wallclocktime" in triggers[0]
+    assert "wallclock_time" in triggers[0]
     with pytest.raises(RuntimeError):  # did not call save in between
         trigger_manager.should_save_snapshot(t, t_next)
     trigger_manager.update_checkpoints(t, t_next, False)
@@ -202,7 +202,7 @@ def test_trigger_manager_warnings(caplog: pytest.LogCaptureFixture,
 
     reference = datetime.now(timezone.utc)
     trigger_manager = TriggerManager(reference, Checkpoints(
-            simulationtime=CheckpointRules(at=[1, 3, 5])))
+            simulation_time=CheckpointRules(at=[1, 3, 5])))
 
     trigger_manager.reuse_instance(2)
 
diff --git a/libmuscle/python/libmuscle/test/test_snapshot.py b/libmuscle/python/libmuscle/test/test_snapshot.py
index 82c0d6a5..c959a226 100644
--- a/libmuscle/python/libmuscle/test/test_snapshot.py
+++ b/libmuscle/python/libmuscle/test/test_snapshot.py
@@ -8,14 +8,14 @@
 @pytest.fixture
 def snapshot() -> Snapshot:
     triggers = ['test triggers']
-    wallclocktime = 15.3
+    wallclock_time = 15.3
     port_message_counts = {'in': [1], 'out': [4], 'muscle_settings_in': [0]}
     is_final = True
     message = Message(1.2, None, 'test_data')
     snapshot = MsgPackSnapshot(
-            triggers, wallclocktime, port_message_counts, is_final, message)
+            triggers, wallclock_time, port_message_counts, is_final, message)
     assert snapshot.triggers == triggers
-    assert snapshot.wallclocktime == wallclocktime
+    assert snapshot.wallclock_time == wallclock_time
     assert snapshot.port_message_counts == port_message_counts
     assert snapshot.is_final_snapshot == is_final
     assert snapshot.message == message
@@ -31,7 +31,7 @@ def test_snapshot(snapshot: Snapshot) -> None:
     snapshot2 = MsgPackSnapshot.from_bytes(binary_snapshot)
 
     assert snapshot2.triggers == snapshot.triggers
-    assert snapshot2.wallclocktime == snapshot.wallclocktime
+    assert snapshot2.wallclock_time == snapshot.wallclock_time
     assert snapshot2.port_message_counts == snapshot.port_message_counts
     assert snapshot2.is_final_snapshot == snapshot.is_final_snapshot
     assert snapshot2.message.timestamp == snapshot.message.timestamp
@@ -43,7 +43,7 @@ def test_snapshot_metadata(snapshot: Snapshot) -> None:
     metadata = SnapshotMetadata.from_snapshot(snapshot, 'test')
 
     assert metadata.triggers == snapshot.triggers
-    assert metadata.wallclocktime == snapshot.wallclocktime
+    assert metadata.wallclock_time == snapshot.wallclock_time
     assert metadata.port_message_counts == snapshot.port_message_counts
     assert metadata.is_final_snapshot == snapshot.is_final_snapshot
     assert metadata.timestamp == snapshot.message.timestamp
diff --git a/libmuscle/python/libmuscle/test/test_snapshot_manager.py b/libmuscle/python/libmuscle/test/test_snapshot_manager.py
index b5a8edde..7ac0972f 100644
--- a/libmuscle/python/libmuscle/test/test_snapshot_manager.py
+++ b/libmuscle/python/libmuscle/test/test_snapshot_manager.py
@@ -41,7 +41,7 @@ def test_save_load_checkpoint(tmp_path: Path) -> None:
     instance_id = Reference('test[1]')
     snapshot_manager = SnapshotManager(instance_id, manager, communicator)
 
-    checkpoints = Checkpoints(simulationtime=CheckpointRules(every=1))
+    checkpoints = Checkpoints(simulation_time=CheckpointRules(every=1))
     snapshot_manager.registered(datetime.now(timezone.utc), checkpoints, None)
 
     snapshot_manager.reuse_instance(None, tmp_path)
@@ -57,7 +57,7 @@ def test_save_load_checkpoint(tmp_path: Path) -> None:
     metadata = manager.submit_snapshot_metadata.call_args.args[0]
     assert isinstance(metadata, SnapshotMetadata)
     assert metadata.triggers
-    assert metadata.wallclocktime > 0.0
+    assert metadata.wallclock_time > 0.0
     assert metadata.timestamp == 0.2
     assert metadata.next_timestamp == 0.4
     assert metadata.port_message_counts == port_message_counts
@@ -86,7 +86,7 @@ def test_save_load_checkpoint(tmp_path: Path) -> None:
     metadata = manager.submit_snapshot_metadata.call_args.args[0]
     assert isinstance(metadata, SnapshotMetadata)
     assert metadata.triggers
-    assert metadata.wallclocktime > 0.0
+    assert metadata.wallclock_time > 0.0
     assert metadata.timestamp == 0.6
     assert metadata.next_timestamp is None
     assert metadata.port_message_counts == port_message_counts

From 07f3b86bfb0c65d4f014d0c6ccd02562d31af044 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Wed, 31 Aug 2022 11:08:08 +0200
Subject: [PATCH 029/183] Add dataclasses backport as dependency for py3.6

---
 setup.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.py b/setup.py
index ade2fa38..b131fb1e 100644
--- a/setup.py
+++ b/setup.py
@@ -55,6 +55,7 @@
         "numpy>=1.22,<=1.25; python_version>='3.8'",
         'qcg-pilotjob==0.13.1',
         'typing_extensions<4',
+        "dataclasses; python_version=='3.6'",
         'ymmsl>=0.12.0,<0.13'          # Also in CI, update there as well
     ],
     extras_require={

From b6b0ef52e6fb69d15b7f17345182497f6a751238 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Wed, 31 Aug 2022 11:13:40 +0200
Subject: [PATCH 030/183] types-dataclasses dependency in tox.ini for py3.6

---
 tox.ini | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tox.ini b/tox.ini
index 23fb19f3..717d5107 100644
--- a/tox.ini
+++ b/tox.ini
@@ -9,6 +9,7 @@ deps =
     pytest
     pytest-cov
     git+https://github.com/multiscale/ymmsl-python.git@feature/checkpointing#egg=ymmsl
+    types-dataclasses; python_version=='3.6'
 
 passenv =
     MUSCLE_TEST_PYTHON_ONLY

From 65036e4ca8df199549e51882dcb5dcb35ef9ad6a Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Wed, 31 Aug 2022 11:22:16 +0200
Subject: [PATCH 031/183] Rewrite MagicMock.call_args.args (py3.8+ only)

---
 libmuscle/python/libmuscle/test/test_snapshot_manager.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libmuscle/python/libmuscle/test/test_snapshot_manager.py b/libmuscle/python/libmuscle/test/test_snapshot_manager.py
index 7ac0972f..a8223e53 100644
--- a/libmuscle/python/libmuscle/test/test_snapshot_manager.py
+++ b/libmuscle/python/libmuscle/test/test_snapshot_manager.py
@@ -54,7 +54,7 @@ def test_save_load_checkpoint(tmp_path: Path) -> None:
 
     communicator.get_message_counts.assert_called_with()
     manager.submit_snapshot_metadata.assert_called()
-    metadata = manager.submit_snapshot_metadata.call_args.args[0]
+    metadata = manager.submit_snapshot_metadata.call_args[0][0]
     assert isinstance(metadata, SnapshotMetadata)
     assert metadata.triggers
     assert metadata.wallclock_time > 0.0
@@ -83,7 +83,7 @@ def test_save_load_checkpoint(tmp_path: Path) -> None:
     assert snapshot_manager2.should_save_final_snapshot(0.6)
     snapshot_manager2.save_final_snapshot(Message(0.6, None, 'test data2'))
 
-    metadata = manager.submit_snapshot_metadata.call_args.args[0]
+    metadata = manager.submit_snapshot_metadata.call_args[0][0]
     assert isinstance(metadata, SnapshotMetadata)
     assert metadata.triggers
     assert metadata.wallclock_time > 0.0

From 98d11311fa7b8ca332c4b114c9102f0675073e02 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Wed, 31 Aug 2022 17:24:03 +0200
Subject: [PATCH 032/183] Add checkpointing API and to Instance

---
 libmuscle/python/libmuscle/instance.py        | 192 +++++++++++++++++-
 libmuscle/python/libmuscle/mmp_client.py      |  11 +-
 .../python/libmuscle/snapshot_manager.py      |  10 +-
 .../python/libmuscle/test/test_instance.py    |   9 +-
 .../libmuscle/test/test_snapshot_manager.py   |   9 +-
 5 files changed, 214 insertions(+), 17 deletions(-)

diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py
index 513018d6..f91bae55 100644
--- a/libmuscle/python/libmuscle/instance.py
+++ b/libmuscle/python/libmuscle/instance.py
@@ -1,13 +1,15 @@
 from copy import copy
+from datetime import datetime
 import logging
 import os
+from pathlib import Path
 import sys
 from typing import cast, Dict, List, Optional, Tuple, overload
 # TODO: import from typing module when dropping support for python 3.7
 from typing_extensions import Literal
 
 from ymmsl import (Identifier, Operator, SettingValue, Port, Reference,
-                   Settings)
+                   Settings, Checkpoints)
 
 from libmuscle.communicator import Communicator, Message
 from libmuscle.settings_manager import SettingsManager
@@ -17,6 +19,7 @@
 from libmuscle.mmp_client import MMPClient
 from libmuscle.profiler import Profiler
 from libmuscle.profiling import ProfileEventType
+from libmuscle.snapshot_manager import SnapshotManager
 from libmuscle.util import extract_log_file_location
 
 
@@ -65,13 +68,20 @@ def __init__(self, ports: Optional[Dict[Operator, List[str]]] = None
         self._settings_manager = SettingsManager()
         """Settings for this instance."""
 
+        self._snapshot_manager = SnapshotManager(
+                self._instance_name(), self.__manager, self._communicator)
+        """Keeps track of checkpointing and snapshots"""
+
         self._first_run = True
         """Keeps track of whether this is the first reuse run."""
 
         self._f_init_cache = dict()     # type: _FInitCacheType
 
-        self._register()
+        checkpoint_info = self._register()
         self._connect()
+        # Note: SnapshotManager.set_checkpoint_info needs to have the ports
+        # initialized so it comes after self._connect()
+        self._snapshot_manager.set_checkpoint_info(*checkpoint_info)
         self._set_local_log_level()
         self._set_remote_log_level()
 
@@ -107,6 +117,17 @@ def reuse_instance(self, apply_overlay: bool = True) -> bool:
                 and everything will be fine. If it turns out that you
                 did need to specify False, MUSCLE3 will tell you about
                 it in an error message and you can add it still.
+
+        Raises:
+            RuntimeError:
+                When implementing the checkpointing API, but libmuscle detected
+                incorrect API calls. The description of the RuntimeError
+                indicates which calls are incorrect or missing. For more
+                information see the checkpointing API documentation in
+                :meth:`resuming`, :meth:`load_snapshot`,
+                :meth:`should_save_snapshot`, :meth:`save_snapshot`,
+                :meth:`should_save_final_snapshot` and
+                :meth:`save_final_snapshot`, or the checkpointing tutorial.
         """
         do_reuse = self.__receive_settings()
 
@@ -132,6 +153,22 @@ def reuse_instance(self, apply_overlay: bool = True) -> bool:
                 if isinstance(message.data, ClosePort):
                     do_reuse = False
 
+        max_f_init_next_timestamp = max(
+                (msg.next_timestamp
+                 for msg in self._f_init_cache.values()
+                 if msg.next_timestamp is not None),
+                default=None)
+        # Note: muscle_snapshot_directory setting is provided by muscle_manager
+        # when checkpointing is enabled for this run. When checkpointing is not
+        # enabled, it might not exist and a KeyError is raised.
+        try:
+            snapshot_dir = self.get_setting('muscle_snapshot_directory', 'str')
+            snapshot_path = Path(cast(str, snapshot_dir))
+        except KeyError:
+            snapshot_path = None
+        self._snapshot_manager.reuse_instance(
+                max_f_init_next_timestamp, snapshot_path)
+
         if not do_reuse:
             self.__close_ports()
             self._communicator.shutdown()
@@ -381,16 +418,161 @@ def receive_with_settings(
         """
         return self.__receive_message(port_name, slot, default, True)
 
-    def _register(self) -> None:
+    def resuming(self) -> bool:
+        """Check if this instance is resuming from a snapshot.
+
+        Must be used by submodels that implement the checkpointing API. You'll
+        get a RuntimeError when not calling this method in an iteration of the
+        reuse loop.
+
+        This method returns True for the first iteration of the reuse loop after
+        resuming from a previously taken snapshot. When resuming from a
+        snapshot, the submodel must load its state from the snapshot as returned
+        by :meth:`load_snapshot` and the F_INIT step must be skipped.
+
+        Returns:
+            True iff the submodel must resume from a snapshot instead of the
+            usual F_INIT step during this iteration of the reuse loop.
+        """
+        return self._snapshot_manager.resuming()
+
+    def load_snapshot(self) -> Message:
+        """Load a snapshot.
+
+        Must only be called when :meth:`resuming` returns True.
+
+        Returns:
+            Message object containing the state as saved in a previous run
+            through :meth:`save_snapshot` or :meth:`save_final_snapshot`
+
+        Raises:
+            RuntimeError: if not resuming from a snapshot.
+        """
+        return self._snapshot_manager.load_snapshot()
+
+    def should_save_snapshot(
+            self, timestamp: float, next_timestamp: Optional[float]) -> bool:
+        """Check if a snapshot should be saved inside a time-integration loop.
+
+        This method checks if a snapshot should be saved right now, based on the
+        provided timestamps and passed wallclock time.
+
+        When the next timestamp is provided, this value will be used to
+        determine if a checkpoint will be passed between now and the next time
+        step. A submodel should always provide the next timestamp if available,
+        since this is the most reliable way to get consistent snapshots across
+        all submodels in the run.
+
+        When a submodel cannot provide the next timestamp, a best efford is made
+        to get consistent snapshots (based on the current timestamp). See the
+        checkpointing tutorial for more information.
+
+        When this method returns True, the submodel must also save a snapshot
+        through :meth:`save_snapshot`. A RuntimeError will be generated when not
+        doing so.
+
+        See also :meth:`should_save_final_snapshot` for the variant that must be
+        called at the end of a time-integration loop, or when a submodel does
+        not have a time-integration loop.
+
+        Args:
+            timestamp: current timestamp of the submodel
+            next_timestamp: timestamp of the next iteration of the time
+                integration loop of the submodel or ``None`` if not available
+
+        Returns:
+            True iff a snapshot should be taken by the submodel according to the
+            checkpoint rules provided in the ymmsl configuration.
+        """
+        return self._snapshot_manager.should_save_snapshot(
+                timestamp, next_timestamp)
+
+    def save_snapshot(self, message: Message) -> None:
+        """Save a snapshot inside a time-integration loop.
+
+        Before saving a snapshot, you should check using
+        :meth:`should_save_snapshot` if a snapshot should be saved according to
+        the checkpoint rules specified in the ymmsl configuration. You should
+        use the same timestamp and next_timestamp in the provided Message object
+        as used to query `should_save_snapshot`.
+
+        Although it is allowed to save a snapshot even when
+        :meth:`should_save_snapshot` returns False, you should avoid this: this
+        situation is not likely to lead to a consistent snapshot over all
+        submodels of the run (and therefore it is not useful to restart from).
+        It could also lead to a lot of snapshot files clogging your file system.
+
+        See also :meth:`save_final_snapshot` for the variant that must be called
+        at the end of a time-integration loop, or when a submodel does not have
+        a time-integration loop.
+
+        Args:
+            message: Message object that is saved as snapshot. The message
+                timestamp and next_timestamp attributes should be the same as
+                passed to :meth:`should_save_snapshot`. The data attribute can
+                be used to store the internal state of the submodel.
+        """
+        return self._snapshot_manager.save_snapshot(message)
+
+    def should_save_final_snapshot(self, timestamp: float) -> bool:
+        """Check if a snapshot should be saved before O_F.
+
+        This method checks if a snapshot should be saved right now, based on the
+        provided timestamp and passed wallclock time.
+
+        When this method returns True, the submodel must also save a snapshot
+        through :meth:`save_final_snapshot`. A RuntimeError will be generated
+        when not doing so.
+
+        See also :meth:`should_save_snapshot` for the variant that may be called
+        inside of a time-integration loop of the submodel.
+
+        Args:
+            timestamp: current timestamp of the submodel
+
+        Returns:
+            True iff a final snapshot should be taken by the submodel according
+            to the checkpoint rules provided in the ymmsl configuration.
+        """
+        return self._snapshot_manager.should_save_final_snapshot(timestamp)
+
+    def save_final_snapshot(self, message: Message) -> None:
+        """Save a snapshot before O_F.
+
+        Before saving a snapshot, you should check using
+        :meth:`should_save_final_snapshot` if a snapshot should be saved
+        according to the checkpoint rules specified in the ymmsl configuration.
+        You should use the same timestamp in the provided Message object as used
+        to query `should_save_final_snapshot`.
+
+        Although it is allowed to save a snapshot even when
+        :meth:`should_save_final_snapshot` returns False, you should avoid this:
+        this situation is not likely to lead to a consistent snapshot over all
+        submodels of the run (and therefore it is not useful to restart from).
+        It could also lead to a lot of snapshot files clogging your file system.
+
+        See also :meth:`save_snapshot` for the variant that may be called inside
+        of a time-integration loop of the submodel.
+
+        Args:
+            message: Message object that is saved as snapshot. The message
+                timestamp should be the same as passed to
+                :meth:`should_save_snapshot`. The data attribute can be used to
+                store the internal state of the submodel.
+        """
+        return self._snapshot_manager.save_final_snapshot(message)
+
+    def _register(self) -> Tuple[datetime, Checkpoints, Optional[Path]]:
         """Register this instance with the manager.
         """
         register_event = self._profiler.start(ProfileEventType.REGISTER)
         locations = self._communicator.get_locations()
         port_list = self.__list_declared_ports()
-        self.__manager.register_instance(self._instance_name(), locations,
-                                         port_list)
+        checkpoint_info = self.__manager.register_instance(
+                self._instance_name(), locations, port_list)
         register_event.stop()
         _logger.info('Registered with the manager')
+        return checkpoint_info
 
     def _connect(self) -> None:
         """Connect this instance to the given peers / conduits.
diff --git a/libmuscle/python/libmuscle/mmp_client.py b/libmuscle/python/libmuscle/mmp_client.py
index 6771b1dd..de6a6897 100644
--- a/libmuscle/python/libmuscle/mmp_client.py
+++ b/libmuscle/python/libmuscle/mmp_client.py
@@ -1,9 +1,11 @@
+from datetime import datetime
+from pathlib import Path
 from random import uniform
 from time import perf_counter, sleep
-from typing import Any, Dict, Iterable, List, Tuple
+from typing import Any, Dict, Iterable, List, Optional, Tuple
 
 import msgpack
-from ymmsl import Conduit, Operator, Port, Reference, Settings
+from ymmsl import Conduit, Operator, Port, Reference, Settings, Checkpoints
 
 from libmuscle.mcp.protocol import RequestType, ResponseType
 from libmuscle.mcp.tcp_transport_client import TcpTransportClient
@@ -109,7 +111,8 @@ def get_settings(self) -> Settings:
         return Settings(response[1])
 
     def register_instance(self, name: Reference, locations: List[str],
-                          ports: List[Port]) -> None:
+                          ports: List[Port]
+                          ) -> Tuple[datetime, Checkpoints, Optional[Path]]:
         """Register a component instance with the manager.
 
         Args:
@@ -126,6 +129,8 @@ def register_instance(self, name: Reference, locations: List[str],
         if len(response) > 1:
             raise RuntimeError(
                     f'Error registering instance: {response[1]}')
+        # TODO
+        return (datetime.now(), Checkpoints(), None)
 
     def request_peers(
             self, name: Reference) -> Tuple[
diff --git a/libmuscle/python/libmuscle/snapshot_manager.py b/libmuscle/python/libmuscle/snapshot_manager.py
index f8477637..cd6f9959 100644
--- a/libmuscle/python/libmuscle/snapshot_manager.py
+++ b/libmuscle/python/libmuscle/snapshot_manager.py
@@ -46,10 +46,10 @@ def __init__(self,
         self._snapshot_directory = None     # type: Optional[Path]
         self._next_snapshot_num = 1
 
-    def registered(self,
-                   utc_reference: datetime,
-                   checkpoints: Checkpoints,
-                   resume: Optional[Path]) -> None:
+    def set_checkpoint_info(self,
+                            utc_reference: datetime,
+                            checkpoints: Checkpoints,
+                            resume: Optional[Path]) -> None:
         """Callback after registering with the manager.
 
         Provide the snapshot manager with info on workflow checkpoints and if we
@@ -75,7 +75,7 @@ def registered(self,
 
     def reuse_instance(self,
                        max_f_init_next_timestamp: Optional[float],
-                       snapshot_directory: Path,
+                       snapshot_directory: Optional[Path],
                        ) -> None:
         """Callback on Instance.reuse_instance
 
diff --git a/libmuscle/python/libmuscle/test/test_instance.py b/libmuscle/python/libmuscle/test/test_instance.py
index 99d8d37b..e8c7f9b0 100644
--- a/libmuscle/python/libmuscle/test/test_instance.py
+++ b/libmuscle/python/libmuscle/test/test_instance.py
@@ -1,9 +1,10 @@
+from datetime import datetime, timezone
 import sys
 from typing import Generator
 from unittest.mock import MagicMock, patch
 
 import pytest
-from ymmsl import Operator, Reference, Settings
+from ymmsl import Operator, Reference, Settings, Checkpoints
 
 from libmuscle.communicator import Message
 from libmuscle.instance import Instance
@@ -48,6 +49,8 @@ def instance(sys_argv_instance):
 
         mmp_client_object = MagicMock()
         mmp_client_object.request_peers.return_value = (None, None, None)
+        checkpoint_info = (datetime.now(timezone.utc), Checkpoints(), None)
+        mmp_client_object.register_instance.return_value = checkpoint_info
         mmp_client.return_value = mmp_client_object
 
         instance = Instance({
@@ -64,6 +67,8 @@ def instance2(sys_argv_instance):
          patch('libmuscle.instance.Communicator'):
         mmp_client_object = MagicMock()
         mmp_client_object.request_peers.return_value = (None, None, None)
+        checkpoint_info = (datetime.now(timezone.utc), Checkpoints(), None)
+        mmp_client_object.register_instance.return_value = checkpoint_info
         mmp_client.return_value = mmp_client_object
         instance = Instance({
             Operator.F_INIT: ['in[]'],
@@ -77,6 +82,8 @@ def test_create_instance(
          patch('libmuscle.instance.Communicator') as comm_type:
         mmp_client_object = MagicMock()
         mmp_client_object.request_peers.return_value = (None, None, None)
+        checkpoint_info = (datetime.now(timezone.utc), Checkpoints(), None)
+        mmp_client_object.register_instance.return_value = checkpoint_info
         mmp_client.return_value = mmp_client_object
         ports = {
             Operator.F_INIT: ['in'],
diff --git a/libmuscle/python/libmuscle/test/test_snapshot_manager.py b/libmuscle/python/libmuscle/test/test_snapshot_manager.py
index a8223e53..8b86ff7a 100644
--- a/libmuscle/python/libmuscle/test/test_snapshot_manager.py
+++ b/libmuscle/python/libmuscle/test/test_snapshot_manager.py
@@ -18,7 +18,8 @@ def test_no_checkpointing(caplog: pytest.LogCaptureFixture, tmp_path: Path
     communicator.get_message_counts.return_value = {}
     snapshot_manager = SnapshotManager(Reference('test'), manager, communicator)
 
-    snapshot_manager.registered(datetime.now(timezone.utc), Checkpoints(), None)
+    snapshot_manager.set_checkpoint_info(
+            datetime.now(timezone.utc), Checkpoints(), None)
 
     snapshot_manager.reuse_instance(None, Path(tmp_path))
     assert not snapshot_manager.resuming()
@@ -42,7 +43,8 @@ def test_save_load_checkpoint(tmp_path: Path) -> None:
     snapshot_manager = SnapshotManager(instance_id, manager, communicator)
 
     checkpoints = Checkpoints(simulation_time=CheckpointRules(every=1))
-    snapshot_manager.registered(datetime.now(timezone.utc), checkpoints, None)
+    snapshot_manager.set_checkpoint_info(
+            datetime.now(timezone.utc), checkpoints, None)
 
     snapshot_manager.reuse_instance(None, tmp_path)
     with pytest.raises(RuntimeError):
@@ -68,7 +70,8 @@ def test_save_load_checkpoint(tmp_path: Path) -> None:
 
     snapshot_manager2 = SnapshotManager(instance_id, manager, communicator)
 
-    snapshot_manager2.registered(datetime.now(timezone.utc), checkpoints, fpath)
+    snapshot_manager2.set_checkpoint_info(
+            datetime.now(timezone.utc), checkpoints, fpath)
     communicator.restore_message_counts.assert_called_with(port_message_counts)
 
     assert snapshot_manager2.resuming()

From ee12e5c16b67c75407a409f4055200470dcef3d0 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Thu, 1 Sep 2022 15:52:03 +0200
Subject: [PATCH 033/183] Refactoring due to ymmsl update

See also
https://github.com/multiscale/ymmsl-python/commit/8e6e7631c6b7f9eab26c3c730f68bb83b7752332
---
 .../python/libmuscle/checkpoint_triggers.py   | 48 ++++++++++---------
 .../test/test_checkpoint_triggers.py          | 24 +++++-----
 .../libmuscle/test/test_snapshot_manager.py   |  4 +-
 3 files changed, 40 insertions(+), 36 deletions(-)

diff --git a/libmuscle/python/libmuscle/checkpoint_triggers.py b/libmuscle/python/libmuscle/checkpoint_triggers.py
index dbb8fcb4..6e4d644e 100644
--- a/libmuscle/python/libmuscle/checkpoint_triggers.py
+++ b/libmuscle/python/libmuscle/checkpoint_triggers.py
@@ -5,7 +5,8 @@
 import time
 from typing import List, Optional, Union
 
-from ymmsl import CheckpointRange, CheckpointRules, Checkpoints
+from ymmsl import (
+        CheckpointRangeRule, CheckpointAtRule, CheckpointRule, Checkpoints)
 
 
 _logger = logging.getLogger(__name__)
@@ -52,14 +53,16 @@ class AtCheckpointTrigger(CheckpointTrigger):
     This triggers at the specified times.
     """
 
-    def __init__(self, at: List[Union[float, int]]) -> None:
+    def __init__(self, at_rules: List[CheckpointAtRule]) -> None:
         """Create an "at" checkpoint trigger
 
         Args:
             at: list of checkpoint moments
         """
-        self._at = at
-        self._at.sort()  # ymmsl already sorts, but just to be sure
+        self._at = []
+        for at_rule in at_rules:
+            self._at.extend(at_rule.at)
+        self._at.sort()
 
     def next_checkpoint(self, cur_time: float) -> Optional[float]:
         if cur_time >= self._at[-1]:
@@ -92,7 +95,7 @@ class RangeCheckpointTrigger(CheckpointTrigger):
     omitted, and is handled by this class as well
     """
 
-    def __init__(self, range: CheckpointRange) -> None:
+    def __init__(self, range: CheckpointRangeRule) -> None:
         """Create a range of checkpoints
 
         Args:
@@ -100,12 +103,12 @@ def __init__(self, range: CheckpointRange) -> None:
         """
         self._start = range.start
         self._stop = range.stop
-        self._step = range.step
+        self._every = range.every
         self._last = None  # type: Union[int, float, None]
         if self._stop is not None:
             start = 0 if self._start is None else self._start
             diff = self._stop - start
-            self._last = start + (diff // self._step) * self._step
+            self._last = start + (diff // self._every) * self._every
 
     def next_checkpoint(self, cur_time: float) -> Optional[float]:
         if self._start is not None and cur_time < self._start:
@@ -114,7 +117,7 @@ def next_checkpoint(self, cur_time: float) -> Optional[float]:
             return None
         start = 0 if self._start is None else self._start
         diff = cur_time - start
-        return float(start + (diff // self._step + 1) * self._step)
+        return float(start + (diff // self._every + 1) * self._every)
 
     def previous_checkpoint(self, cur_time: float) -> Optional[float]:
         if self._start is not None and cur_time < self._start:
@@ -123,30 +126,31 @@ def previous_checkpoint(self, cur_time: float) -> Optional[float]:
             return float(self._last)
         start = 0 if self._start is None else self._start
         diff = cur_time - start
-        return float(start + (diff // self._step) * self._step)
+        return float(start + (diff // self._every) * self._every)
 
 
 class CombinedCheckpointTriggers(CheckpointTrigger):
     """Checkpoint trigger based on a combination of "every", "at" and "ranges"
     """
 
-    def __init__(self, checkpoint_rules: Optional[CheckpointRules]) -> None:
+    def __init__(self, checkpoint_rules: List[CheckpointRule]) -> None:
         """Create a new combined checkpoint trigger from the given rules
 
         Args:
-            checkpoint_rules: checkpoint rules (from ymmsl) defining "every",
-                "at", and/or "ranges" rules
+            checkpoint_rules: checkpoint rules (from ymmsl)
         """
-        self._triggers = []  # type: List[CheckpointTrigger]
-        if checkpoint_rules is None:
-            return
-        if checkpoint_rules.every is not None:
-            cp_range = CheckpointRange(step=checkpoint_rules.every)
-            self._triggers.append(RangeCheckpointTrigger(cp_range))
-        if checkpoint_rules.at:
-            self._triggers.append(AtCheckpointTrigger(checkpoint_rules.at))
-        for cp_range in checkpoint_rules.ranges:
-            self._triggers.append(RangeCheckpointTrigger(cp_range))
+        self._triggers = []     # type: List[CheckpointTrigger]
+        at_rules = []           # type: List[CheckpointAtRule]
+        for rule in checkpoint_rules:
+            if isinstance(rule, CheckpointAtRule):
+                if rule.at:
+                    at_rules.append(rule)
+            elif isinstance(rule, CheckpointRangeRule):
+                self._triggers.append(RangeCheckpointTrigger(rule))
+            else:
+                raise RuntimeError('Unknown checkpoint rule')
+        if at_rules:
+            self._triggers.append(AtCheckpointTrigger(at_rules))
 
     def next_checkpoint(self, cur_time: float) -> Optional[float]:
         checkpoints = (trigger.next_checkpoint(cur_time)
diff --git a/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py b/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py
index 41577ad3..baf0c2c1 100644
--- a/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py
+++ b/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py
@@ -2,7 +2,7 @@
 import logging
 import time
 import pytest
-from ymmsl import CheckpointRange, CheckpointRules, Checkpoints
+from ymmsl import CheckpointRangeRule, CheckpointAtRule, Checkpoints
 
 from libmuscle.checkpoint_triggers import (
     CombinedCheckpointTriggers, AtCheckpointTrigger, RangeCheckpointTrigger,
@@ -10,7 +10,7 @@
 
 
 def test_at_checkpoint_trigger():
-    trigger = AtCheckpointTrigger([1, 3, 4, 4.5, 9])
+    trigger = AtCheckpointTrigger([CheckpointAtRule([1, 3, 4, 4.5, 9])])
 
     assert trigger.next_checkpoint(0) == 1
     assert trigger.previous_checkpoint(0) is None
@@ -39,7 +39,7 @@ def test_at_checkpoint_trigger():
 
 
 def test_range_checkpoint_trigger():
-    range = CheckpointRange(start=0, stop=20, step=1.2)
+    range = CheckpointRangeRule(start=0, stop=20, every=1.2)
     trigger = RangeCheckpointTrigger(range)
 
     assert trigger.next_checkpoint(-1) == 0
@@ -59,7 +59,7 @@ def test_range_checkpoint_trigger():
 
 
 def test_range_checkpoint_trigger_default_stop():
-    range = CheckpointRange(start=1, step=1.2)
+    range = CheckpointRangeRule(start=1, every=1.2)
     trigger = RangeCheckpointTrigger(range)
 
     assert trigger.next_checkpoint(-1.) == 1
@@ -73,7 +73,7 @@ def test_range_checkpoint_trigger_default_stop():
 
 
 def test_range_checkpoint_trigger_default_start():
-    range = CheckpointRange(step=1.2, stop=10)
+    range = CheckpointRangeRule(every=1.2, stop=10)
     trigger = RangeCheckpointTrigger(range)
 
     assert trigger.next_checkpoint(10) is None
@@ -87,7 +87,7 @@ def test_range_checkpoint_trigger_default_start():
 
 
 def test_combined_checkpoint_trigger_every_at():
-    rules = CheckpointRules(every=10, at=[3, 7, 13, 17])
+    rules = [CheckpointRangeRule(every=10), CheckpointAtRule([3, 7, 13, 17])]
     trigger = CombinedCheckpointTriggers(rules)
 
     assert trigger.next_checkpoint(-11.) == pytest.approx(-10)
@@ -107,9 +107,9 @@ def test_combined_checkpoint_trigger_every_at():
 
 
 def test_combined_checkpoint_trigger_at_ranges():
-    rules = CheckpointRules(at=[3, 7, 13, 17], ranges=[
-                    CheckpointRange(start=0, step=5, stop=20),
-                    CheckpointRange(start=20, step=20, stop=100)])
+    rules = [CheckpointAtRule([3, 7, 13, 17]),
+             CheckpointRangeRule(start=0, every=5, stop=20),
+             CheckpointRangeRule(start=20, every=20, stop=100)]
     trigger = CombinedCheckpointTriggers(rules)
 
     assert trigger.next_checkpoint(-11.) == pytest.approx(0)
@@ -150,8 +150,8 @@ def test_trigger_manager_reference_time():
 def test_trigger_manager():
     reference = datetime.now(timezone.utc)
     trigger_manager = TriggerManager(reference, Checkpoints(
-            wallclock_time=CheckpointRules(at=[1e-12]),
-            simulation_time=CheckpointRules(at=[1, 3, 5])))
+            wallclock_time=[CheckpointAtRule([1e-12])],
+            simulation_time=[CheckpointAtRule([1, 3, 5])]))
 
     trigger_manager.reuse_instance(7)
 
@@ -202,7 +202,7 @@ def test_trigger_manager_warnings(caplog: pytest.LogCaptureFixture,
 
     reference = datetime.now(timezone.utc)
     trigger_manager = TriggerManager(reference, Checkpoints(
-            simulation_time=CheckpointRules(at=[1, 3, 5])))
+            simulation_time=[CheckpointAtRule([1, 3, 5])]))
 
     trigger_manager.reuse_instance(2)
 
diff --git a/libmuscle/python/libmuscle/test/test_snapshot_manager.py b/libmuscle/python/libmuscle/test/test_snapshot_manager.py
index 8b86ff7a..d7d386c9 100644
--- a/libmuscle/python/libmuscle/test/test_snapshot_manager.py
+++ b/libmuscle/python/libmuscle/test/test_snapshot_manager.py
@@ -4,7 +4,7 @@
 from unittest.mock import MagicMock
 
 import pytest
-from ymmsl import Reference, Checkpoints, CheckpointRules
+from ymmsl import Reference, Checkpoints, CheckpointRangeRule
 
 from libmuscle.communicator import Message
 from libmuscle.snapshot import SnapshotMetadata
@@ -42,7 +42,7 @@ def test_save_load_checkpoint(tmp_path: Path) -> None:
     instance_id = Reference('test[1]')
     snapshot_manager = SnapshotManager(instance_id, manager, communicator)
 
-    checkpoints = Checkpoints(simulation_time=CheckpointRules(every=1))
+    checkpoints = Checkpoints(simulation_time=[CheckpointRangeRule(every=1)])
     snapshot_manager.set_checkpoint_info(
             datetime.now(timezone.utc), checkpoints, None)
 

From 2ab52946f994ee92c3b97e9f41246a98adb8bc2d Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Thu, 1 Sep 2022 17:31:11 +0200
Subject: [PATCH 034/183] Add checkpoint info in register_instance response

---
 libmuscle/cpp/src/libmuscle/mmp_client.cpp    |  2 +-
 libmuscle/python/libmuscle/manager/manager.py |  2 +-
 .../python/libmuscle/manager/mmp_server.py    | 68 ++++++++++++++++---
 .../python/libmuscle/manager/test/conftest.py | 20 +++---
 .../manager/test/test_mmp_request_handler.py  | 21 +++---
 libmuscle/python/libmuscle/mmp_client.py      | 50 ++++++++++++--
 .../python/libmuscle/test/test_mmp_client.py  |  6 +-
 7 files changed, 132 insertions(+), 37 deletions(-)

diff --git a/libmuscle/cpp/src/libmuscle/mmp_client.cpp b/libmuscle/cpp/src/libmuscle/mmp_client.cpp
index acd6672d..de50e894 100644
--- a/libmuscle/cpp/src/libmuscle/mmp_client.cpp
+++ b/libmuscle/cpp/src/libmuscle/mmp_client.cpp
@@ -105,7 +105,7 @@ void MMPClient::register_instance(
 
     auto response = call_manager_(request);
 
-    if (response.size() > 1)
+    if (response[0].as<int>() == static_cast<int>(ResponseType::error))
         throw std::runtime_error(
                 "Error registering instance: " + response[1].as<std::string>());
 }
diff --git a/libmuscle/python/libmuscle/manager/manager.py b/libmuscle/python/libmuscle/manager/manager.py
index c8aa76d4..21f21c60 100644
--- a/libmuscle/python/libmuscle/manager/manager.py
+++ b/libmuscle/python/libmuscle/manager/manager.py
@@ -58,7 +58,7 @@ def __init__(
             pass
 
         self._server = MMPServer(
-                self._logger, self._configuration.settings,
+                self._logger, self._configuration,
                 self._instance_registry, self._topology_store)
 
         if self._instance_manager:
diff --git a/libmuscle/python/libmuscle/manager/mmp_server.py b/libmuscle/python/libmuscle/manager/mmp_server.py
index 0f66690a..04a99680 100644
--- a/libmuscle/python/libmuscle/manager/mmp_server.py
+++ b/libmuscle/python/libmuscle/manager/mmp_server.py
@@ -1,9 +1,12 @@
+from datetime import datetime, timezone
 import errno
 import logging
-from typing import Any, cast, Generator, List
+from typing import Any, Dict, Optional, Tuple, cast, Generator, List
 
 import msgpack
-from ymmsl import Conduit, Identifier, Operator, Port, Reference, Settings
+from ymmsl import (
+        Conduit, Identifier, Operator, Port, Reference, PartialConfiguration,
+        Checkpoints)
 
 from libmuscle.logging import LogLevel
 from libmuscle.manager.instance_registry import (
@@ -19,6 +22,8 @@
 
 _logger = logging.getLogger(__name__)
 
+_EncodedCheckpointType = Dict[str, List[Dict[str, Any]]]
+
 
 def decode_operator(data: str) -> Operator:
     """Create an Operator from a MsgPack-compatible value."""
@@ -35,12 +40,20 @@ def encode_conduit(conduit: Conduit) -> List[str]:
     return [str(conduit.sender), str(conduit.receiver)]
 
 
+def encode_checkpoints(checkpoints: Checkpoints) -> _EncodedCheckpointType:
+    """Convert a Checkpoins to a MsgPack-compatible value."""
+    return {
+        "wallclock_time": [vars(rule) for rule in checkpoints.wallclock_time],
+        "simulation_time": [vars(rule) for rule in checkpoints.simulation_time]
+    }
+
+
 class MMPRequestHandler(RequestHandler):
     """Handles Manager requests."""
     def __init__(
             self,
             logger: Logger,
-            settings: Settings,
+            configuration: PartialConfiguration,
             instance_registry: InstanceRegistry,
             topology_store: TopologyStore):
         """Create an MMPRequestHandler.
@@ -52,9 +65,10 @@ def __init__(
             topology_store: Keeps track of how to connect things.
         """
         self._logger = logger
-        self._settings = settings
+        self._configuration = configuration
         self._instance_registry = instance_registry
         self._topology_store = topology_store
+        self._reference_time = datetime.now(timezone.utc)
 
     def handle_request(self, request: bytes) -> bytes:
         """Handles a manager request.
@@ -98,14 +112,22 @@ def _register_instance(
             status (ResponseType): SUCCESS or ERROR
             error_msg (str): An error message, only present if status
                 equals ERROR
+            checkpoint_info (Tuple[str, bytes, Optional[str]]): Checkpoint info,
+                only present if status equals SUCCESS. The first item is an
+                ISO8601 encoding of the wallclock reference time (see
+                :meth:`datetime.datetime.isoformat`). The second item is a
+                yaml-encoded ymmsl.Checkpoints object. The final item is the
+                checkpoint filename that the registered instance should resume
+                from, or None if no resume is requested.
         """
         port_objs = [decode_port(p) for p in ports]
+        instance = Reference(instance_id)
         try:
-            self._instance_registry.add(
-                Reference(instance_id), locations, port_objs)
+            self._instance_registry.add(instance, locations, port_objs)
 
             _logger.info(f'Registered instance {instance_id}')
-            return [ResponseType.SUCCESS.value]
+            checkpoint_info = self._get_checkpoint_info(instance)
+            return [ResponseType.SUCCESS.value, checkpoint_info]
         except AlreadyRegistered:
             return [
                     ResponseType.ERROR.value,
@@ -202,7 +224,7 @@ def _get_settings(self) -> Any:
         """
         return [
                 ResponseType.SUCCESS.value,
-                self._settings.as_ordered_dict()]
+                self._configuration.settings.as_ordered_dict()]
 
     def _submit_log_message(
             self, instance_id: str, timestamp: float, level: int, text: str
@@ -261,6 +283,29 @@ def _generate_peer_instances(
                 for peer_indices in generate_indices(peer_dims[len(dims):]):
                     yield base + peer_indices
 
+    def _get_checkpoint_info(
+                self,
+                instance: Reference
+                ) -> Tuple[str, _EncodedCheckpointType, Optional[str]]:
+        """Get checkpoint info for an instance
+
+        Args:
+            instance: The instance whose checkpoint info to get
+
+        Returns:
+            wallclock_reference_time: :meth:`datetime.datetime.isoformat`
+                encoded UTC reference for wallclock time = 0
+            checkpoints: yaml-encoded ymmsl.Checkpoints object
+            resume: path of the snapshot file to resume from (or None if not
+                resuming)
+        """
+        resume = None
+        if instance in self._configuration.resume:
+            resume = str(self._configuration.resume[instance])
+        return (self._reference_time.isoformat(),
+                encode_checkpoints(self._configuration.checkpoints),
+                resume)
+
 
 class MMPServer:
     """The MUSCLE Manager Protocol server.
@@ -272,7 +317,7 @@ class MMPServer:
     def __init__(
             self,
             logger: Logger,
-            settings: Settings,
+            configuration: PartialConfiguration,
             instance_registry: InstanceRegistry,
             topology_store: TopologyStore
             ) -> None:
@@ -285,13 +330,14 @@ def __init__(
 
         Args:
             logger: Logger to send log messages to
-            settings: Settings component to get settings from
+            configuration: Configuration component to get settings, checkpoints
+                and resumes from
             instance_registry: To register instances with and get
                 peer locations from
             topology_store: To get peers and conduits from
         """
         self._handler = MMPRequestHandler(
-                logger, settings, instance_registry, topology_store)
+                logger, configuration, instance_registry, topology_store)
         try:
             self._server = TcpTransportServer(self._handler, 9000)
         except OSError as e:
diff --git a/libmuscle/python/libmuscle/manager/test/conftest.py b/libmuscle/python/libmuscle/manager/test/conftest.py
index 9ba095dd..433e23b1 100644
--- a/libmuscle/python/libmuscle/manager/test/conftest.py
+++ b/libmuscle/python/libmuscle/manager/test/conftest.py
@@ -2,7 +2,7 @@
 
 import pytest
 from ymmsl import (Component, Conduit, Configuration, Model, Reference,
-                   Settings)
+                   PartialConfiguration)
 
 from libmuscle.manager.instance_registry import InstanceRegistry
 from libmuscle.manager.logger import Logger
@@ -18,8 +18,8 @@ def logger(tmpdir):
 
 
 @pytest.fixture
-def settings():
-    return Settings()
+def mmp_configuration():
+    return PartialConfiguration()
 
 
 @pytest.fixture
@@ -45,9 +45,10 @@ def topology_store() -> TopologyStore:
 
 
 @pytest.fixture
-def mmp_request_handler(logger, settings, instance_registry, topology_store):
+def mmp_request_handler(
+        logger, mmp_configuration, instance_registry, topology_store):
     return MMPRequestHandler(
-            logger, settings, instance_registry, topology_store)
+            logger, mmp_configuration, instance_registry, topology_store)
 
 
 @pytest.fixture
@@ -63,9 +64,9 @@ def loaded_instance_registry(instance_registry):
 
 @pytest.fixture
 def registered_mmp_request_handler(
-        logger, settings, loaded_instance_registry, topology_store):
+        logger, mmp_configuration, loaded_instance_registry, topology_store):
     return MMPRequestHandler(
-            logger, settings, loaded_instance_registry, topology_store)
+            logger, mmp_configuration, loaded_instance_registry, topology_store)
 
 
 @pytest.fixture
@@ -109,6 +110,7 @@ def loaded_instance_registry2():
 
 @pytest.fixture
 def registered_mmp_request_handler2(
-        logger, settings, loaded_instance_registry2, topology_store2):
+        logger, mmp_configuration, loaded_instance_registry2, topology_store2):
     return MMPRequestHandler(
-            logger, settings, loaded_instance_registry2, topology_store2)
+            logger, mmp_configuration,
+            loaded_instance_registry2, topology_store2)
diff --git a/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py b/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py
index 6c40e02e..733baa61 100644
--- a/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py
+++ b/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py
@@ -6,9 +6,10 @@
 from libmuscle.mcp.protocol import RequestType, ResponseType
 
 
-def test_create_servicer(logger, settings, instance_registry,
+def test_create_servicer(logger, mmp_configuration, instance_registry,
                          topology_store):
-    MMPRequestHandler(logger, settings, instance_registry, topology_store)
+    MMPRequestHandler(
+            logger, mmp_configuration, instance_registry, topology_store)
 
 
 def test_log_message(mmp_request_handler, caplog):
@@ -31,7 +32,7 @@ def test_log_message(mmp_request_handler, caplog):
     assert caplog.records[0].message == 'Testing log message'
 
 
-def test_get_settings(settings, mmp_request_handler):
+def test_get_settings(mmp_configuration, mmp_request_handler):
     request = [RequestType.GET_SETTINGS.value]
     encoded_request = msgpack.packb(request, use_bin_type=True)
 
@@ -42,12 +43,12 @@ def test_get_settings(settings, mmp_request_handler):
     assert decoded_result[0] == ResponseType.SUCCESS.value
     assert decoded_result[1] == {}
 
-    settings['test1'] = 13
-    settings['test2'] = 12.3
-    settings['test3'] = 'testing'
-    settings['test4'] = True
-    settings['test5'] = [2.3, 7.4]
-    settings['test6'] = [[1.0, 2.0], [2.0, 1.0]]
+    mmp_configuration.settings['test1'] = 13
+    mmp_configuration.settings['test2'] = 12.3
+    mmp_configuration.settings['test3'] = 'testing'
+    mmp_configuration.settings['test4'] = True
+    mmp_configuration.settings['test5'] = [2.3, 7.4]
+    mmp_configuration.settings['test6'] = [[1.0, 2.0], [2.0, 1.0]]
 
     result = mmp_request_handler.handle_request(encoded_request)
     decoded_result = msgpack.unpackb(result, raw=False)
@@ -63,7 +64,7 @@ def test_get_settings(settings, mmp_request_handler):
     assert result_dict['test4'] is True
     assert result_dict['test5'] == [2.3, 7.4]
     assert result_dict['test6'] == [[1.0, 2.0], [2.0, 1.0]]
-    assert result_dict == settings.as_ordered_dict()
+    assert result_dict == mmp_configuration.settings.as_ordered_dict()
 
 
 def test_register_instance(mmp_request_handler, instance_registry):
diff --git a/libmuscle/python/libmuscle/mmp_client.py b/libmuscle/python/libmuscle/mmp_client.py
index de6a6897..685ae888 100644
--- a/libmuscle/python/libmuscle/mmp_client.py
+++ b/libmuscle/python/libmuscle/mmp_client.py
@@ -5,7 +5,9 @@
 from typing import Any, Dict, Iterable, List, Optional, Tuple
 
 import msgpack
-from ymmsl import Conduit, Operator, Port, Reference, Settings, Checkpoints
+from ymmsl import (
+        Conduit, Operator, Port, Reference, Settings, Checkpoints,
+        CheckpointRule, CheckpointRangeRule, CheckpointAtRule)
 
 from libmuscle.mcp.protocol import RequestType, ResponseType
 from libmuscle.mcp.tcp_transport_client import TcpTransportClient
@@ -48,6 +50,42 @@ def encode_profile_event(event: ProfileEvent) -> Any:
             event.message_size]
 
 
+def decode_checkpoint_rule(rule: Dict[str, Any]) -> CheckpointRule:
+    """Decode a checkpoint rule from a MsgPack-compatible value."""
+    if rule.keys() == {'in'}:
+        return CheckpointAtRule(**rule)
+    if rule.keys() == {'start', 'stop', 'every'}:
+        return CheckpointRangeRule(**rule)
+    raise ValueError('Cannot convert {rule} to a checkpoint rule.')
+
+
+def decode_checkpoint_info(
+        iso_walltime_reference: str,
+        checkpoints_dict: Dict[str, List[Dict[str, Any]]],
+        resume: Optional[str]
+        ) -> Tuple[datetime, Checkpoints, Optional[Path]]:
+    """Decode checkpoint info from a MsgPack-compatible value.
+
+    Args:
+        iso_walltime_reference: iso8601 string generated by datetime.isoformat
+        checkpoints_dict: dictionary of checkpoint definitions
+        resume: optional string indicating resume path
+
+    Returns:
+        wallclock_time_reference: UTC time where wallclock_time = 0
+        checkpoints: checkpoint configuration
+        resume: path to the resume snapshot
+    """
+    wallclock_time_reference = datetime.fromisoformat(iso_walltime_reference)
+    checkpoints = Checkpoints(
+            wallclock_time=[decode_checkpoint_rule(rule)
+                            for rule in checkpoints_dict["wallclock_time"]],
+            simulation_time=[decode_checkpoint_rule(rule)
+                             for rule in checkpoints_dict["simulation_time"]])
+    resume_path = None if resume is None else Path(resume)
+    return (wallclock_time_reference, checkpoints, resume_path)
+
+
 class MMPClient():
     """The client for the MUSCLE Manager Protocol.
 
@@ -120,17 +158,21 @@ def register_instance(self, name: Reference, locations: List[str],
             locations: List of places where the instance can be
                     reached.
             ports: List of ports of this instance.
+
+        Returns:
+            wallclock_time_reference: UTC time where wallclock_time = 0
+            checkpoints: checkpoint configuration
+            resume: path to the resume snapshot
         """
         request = [
                 RequestType.REGISTER_INSTANCE.value,
                 str(name), locations,
                 [encode_port(p) for p in ports]]
         response = self._call_manager(request)
-        if len(response) > 1:
+        if response[0] == ResponseType.ERROR.value:
             raise RuntimeError(
                     f'Error registering instance: {response[1]}')
-        # TODO
-        return (datetime.now(), Checkpoints(), None)
+        return decode_checkpoint_info(*response[1])
 
     def request_peers(
             self, name: Reference) -> Tuple[
diff --git a/libmuscle/python/libmuscle/test/test_mmp_client.py b/libmuscle/python/libmuscle/test/test_mmp_client.py
index d5051962..be098d61 100644
--- a/libmuscle/python/libmuscle/test/test_mmp_client.py
+++ b/libmuscle/python/libmuscle/test/test_mmp_client.py
@@ -1,3 +1,4 @@
+from datetime import datetime, timezone
 from unittest.mock import patch
 
 import msgpack
@@ -73,7 +74,10 @@ def test_get_settings(mocked_mmp_client) -> None:
 def test_register_instance(mocked_mmp_client) -> None:
     client, stub = mocked_mmp_client
 
-    result = [ResponseType.SUCCESS.value]
+    result = [ResponseType.SUCCESS.value,
+              (datetime.now(timezone.utc).isoformat(),
+               {'wallclock_time': [], 'simulation_time': []},
+               None)]
     stub.call.return_value = msgpack.packb(result, use_bin_type=True)
 
     client.register_instance(

From ab9fbd5aa3b3c9a6a0e86134f3713b35903436e6 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Fri, 2 Sep 2022 10:09:15 +0200
Subject: [PATCH 035/183] Send reference time as tuple instead of ISO string

---
 .../python/libmuscle/manager/mmp_server.py    | 24 ++++++++++++-------
 libmuscle/python/libmuscle/mmp_client.py      | 11 +++++----
 .../python/libmuscle/test/test_mmp_client.py  |  6 ++++-
 3 files changed, 26 insertions(+), 15 deletions(-)

diff --git a/libmuscle/python/libmuscle/manager/mmp_server.py b/libmuscle/python/libmuscle/manager/mmp_server.py
index 04a99680..b237f316 100644
--- a/libmuscle/python/libmuscle/manager/mmp_server.py
+++ b/libmuscle/python/libmuscle/manager/mmp_server.py
@@ -22,6 +22,7 @@
 
 _logger = logging.getLogger(__name__)
 
+_EncodedTimeType = Tuple[int, int, int, int, int, int, int]
 _EncodedCheckpointType = Dict[str, List[Dict[str, Any]]]
 
 
@@ -68,7 +69,10 @@ def __init__(
         self._configuration = configuration
         self._instance_registry = instance_registry
         self._topology_store = topology_store
-        self._reference_time = datetime.now(timezone.utc)
+        reftime = datetime.now(timezone.utc)
+        self._reference_time_tuple = (reftime.year, reftime.month, reftime.day,
+                                      reftime.hour, reftime.minute,
+                                      reftime.second, reftime.microsecond)
 
     def handle_request(self, request: bytes) -> bytes:
         """Handles a manager request.
@@ -113,10 +117,10 @@ def _register_instance(
             error_msg (str): An error message, only present if status
                 equals ERROR
             checkpoint_info (Tuple[str, bytes, Optional[str]]): Checkpoint info,
-                only present if status equals SUCCESS. The first item is an
-                ISO8601 encoding of the wallclock reference time (see
-                :meth:`datetime.datetime.isoformat`). The second item is a
-                yaml-encoded ymmsl.Checkpoints object. The final item is the
+                only present if status equals SUCCESS. The first item is a tuple
+                encoding of the wallclock reference time (year, month, day,
+                hour, minute, second, microsecond) in UTC. The second item is a
+                dict encoding a ymmsl.Checkpoints object. The final item is the
                 checkpoint filename that the registered instance should resume
                 from, or None if no resume is requested.
         """
@@ -286,15 +290,17 @@ def _generate_peer_instances(
     def _get_checkpoint_info(
                 self,
                 instance: Reference
-                ) -> Tuple[str, _EncodedCheckpointType, Optional[str]]:
+                ) -> Tuple[_EncodedTimeType,
+                           _EncodedCheckpointType,
+                           Optional[str]]:
         """Get checkpoint info for an instance
 
         Args:
             instance: The instance whose checkpoint info to get
 
         Returns:
-            wallclock_reference_time: :meth:`datetime.datetime.isoformat`
-                encoded UTC reference for wallclock time = 0
+            wallclock_reference_time: tuple encoding UTC reference for wallclock
+                time = 0: (year, month, day, hour, minute, second, microsecond)
             checkpoints: yaml-encoded ymmsl.Checkpoints object
             resume: path of the snapshot file to resume from (or None if not
                 resuming)
@@ -302,7 +308,7 @@ def _get_checkpoint_info(
         resume = None
         if instance in self._configuration.resume:
             resume = str(self._configuration.resume[instance])
-        return (self._reference_time.isoformat(),
+        return (self._reference_time_tuple,
                 encode_checkpoints(self._configuration.checkpoints),
                 resume)
 
diff --git a/libmuscle/python/libmuscle/mmp_client.py b/libmuscle/python/libmuscle/mmp_client.py
index 685ae888..f8f05330 100644
--- a/libmuscle/python/libmuscle/mmp_client.py
+++ b/libmuscle/python/libmuscle/mmp_client.py
@@ -1,4 +1,4 @@
-from datetime import datetime
+from datetime import datetime, timezone
 from pathlib import Path
 from random import uniform
 from time import perf_counter, sleep
@@ -60,14 +60,15 @@ def decode_checkpoint_rule(rule: Dict[str, Any]) -> CheckpointRule:
 
 
 def decode_checkpoint_info(
-        iso_walltime_reference: str,
+        utc_walltime_reference: Tuple[int, int, int, int, int, int, int],
         checkpoints_dict: Dict[str, List[Dict[str, Any]]],
         resume: Optional[str]
         ) -> Tuple[datetime, Checkpoints, Optional[Path]]:
     """Decode checkpoint info from a MsgPack-compatible value.
 
     Args:
-        iso_walltime_reference: iso8601 string generated by datetime.isoformat
+        utc_walltime_reference: tuple (year, month, day, hour, minute, second,
+            microsecond) in UTC timezone
         checkpoints_dict: dictionary of checkpoint definitions
         resume: optional string indicating resume path
 
@@ -76,14 +77,14 @@ def decode_checkpoint_info(
         checkpoints: checkpoint configuration
         resume: path to the resume snapshot
     """
-    wallclock_time_reference = datetime.fromisoformat(iso_walltime_reference)
+    ref_time = datetime(*utc_walltime_reference, tzinfo=timezone.utc)
     checkpoints = Checkpoints(
             wallclock_time=[decode_checkpoint_rule(rule)
                             for rule in checkpoints_dict["wallclock_time"]],
             simulation_time=[decode_checkpoint_rule(rule)
                              for rule in checkpoints_dict["simulation_time"]])
     resume_path = None if resume is None else Path(resume)
-    return (wallclock_time_reference, checkpoints, resume_path)
+    return (ref_time, checkpoints, resume_path)
 
 
 class MMPClient():
diff --git a/libmuscle/python/libmuscle/test/test_mmp_client.py b/libmuscle/python/libmuscle/test/test_mmp_client.py
index be098d61..e93d6014 100644
--- a/libmuscle/python/libmuscle/test/test_mmp_client.py
+++ b/libmuscle/python/libmuscle/test/test_mmp_client.py
@@ -74,8 +74,12 @@ def test_get_settings(mocked_mmp_client) -> None:
 def test_register_instance(mocked_mmp_client) -> None:
     client, stub = mocked_mmp_client
 
+    reftime = datetime.now(timezone.utc)
+    reference_time_tuple = (reftime.year, reftime.month, reftime.day,
+                            reftime.hour, reftime.minute,
+                            reftime.second, reftime.microsecond)
     result = [ResponseType.SUCCESS.value,
-              (datetime.now(timezone.utc).isoformat(),
+              (reference_time_tuple,
                {'wallclock_time': [], 'simulation_time': []},
                None)]
     stub.call.return_value = msgpack.packb(result, use_bin_type=True)

From dddc630c0bc0059013e072e2541bf2dbf2109448 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Fri, 2 Sep 2022 11:03:06 +0200
Subject: [PATCH 036/183] Send reference wallclock time as timestamp

---
 .../python/libmuscle/manager/mmp_server.py    | 13 ++----
 .../manager/test/test_mmp_request_handler.py  | 40 ++++++++++++++++++-
 libmuscle/python/libmuscle/mmp_client.py      |  8 ++--
 .../python/libmuscle/test/test_mmp_client.py  |  6 +--
 4 files changed, 48 insertions(+), 19 deletions(-)

diff --git a/libmuscle/python/libmuscle/manager/mmp_server.py b/libmuscle/python/libmuscle/manager/mmp_server.py
index b237f316..793bbfc8 100644
--- a/libmuscle/python/libmuscle/manager/mmp_server.py
+++ b/libmuscle/python/libmuscle/manager/mmp_server.py
@@ -22,7 +22,6 @@
 
 _logger = logging.getLogger(__name__)
 
-_EncodedTimeType = Tuple[int, int, int, int, int, int, int]
 _EncodedCheckpointType = Dict[str, List[Dict[str, Any]]]
 
 
@@ -69,10 +68,8 @@ def __init__(
         self._configuration = configuration
         self._instance_registry = instance_registry
         self._topology_store = topology_store
-        reftime = datetime.now(timezone.utc)
-        self._reference_time_tuple = (reftime.year, reftime.month, reftime.day,
-                                      reftime.hour, reftime.minute,
-                                      reftime.second, reftime.microsecond)
+        self._reference_time = datetime.now(timezone.utc)
+        self._reference_timestamp = self._reference_time.timestamp()
 
     def handle_request(self, request: bytes) -> bytes:
         """Handles a manager request.
@@ -290,9 +287,7 @@ def _generate_peer_instances(
     def _get_checkpoint_info(
                 self,
                 instance: Reference
-                ) -> Tuple[_EncodedTimeType,
-                           _EncodedCheckpointType,
-                           Optional[str]]:
+                ) -> Tuple[float, _EncodedCheckpointType, Optional[str]]:
         """Get checkpoint info for an instance
 
         Args:
@@ -308,7 +303,7 @@ def _get_checkpoint_info(
         resume = None
         if instance in self._configuration.resume:
             resume = str(self._configuration.resume[instance])
-        return (self._reference_time_tuple,
+        return (self._reference_timestamp,
                 encode_checkpoints(self._configuration.checkpoints),
                 resume)
 
diff --git a/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py b/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py
index 733baa61..0d91c650 100644
--- a/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py
+++ b/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py
@@ -1,5 +1,8 @@
+from datetime import datetime, timezone
+from pathlib import Path
 import msgpack
-from ymmsl import Operator, Reference
+from ymmsl import (
+        Operator, Reference, Checkpoints, CheckpointRangeRule, CheckpointAtRule)
 
 from libmuscle.logging import LogLevel
 from libmuscle.manager.mmp_server import MMPRequestHandler
@@ -87,6 +90,41 @@ def test_register_instance(mmp_request_handler, instance_registry):
     assert registered_ports['test_instance'][0].operator == Operator.F_INIT
 
 
+def test_register_instance_checkpoint_info(
+        mmp_configuration, mmp_request_handler):
+    resume_path = Path('/path/to/resume.pack')
+    mmp_configuration.resume = {Reference('test_instance'): resume_path}
+    mmp_configuration.checkpoints = Checkpoints([CheckpointRangeRule(every=10),
+                                                 CheckpointAtRule([1, 2, 3.0])])
+
+    request = [
+            RequestType.REGISTER_INSTANCE.value,
+            'test_instance',
+            ['tcp://localhost:10000'],
+            [['test_in', 'F_INIT']]]
+    encoded_request = msgpack.packb(request, use_bin_type=True)
+
+    result = mmp_request_handler.handle_request(encoded_request)
+    decoded_result = msgpack.unpackb(result, raw=False)
+
+    assert decoded_result[0] == ResponseType.SUCCESS.value
+    timestamp, checkpoints, resume = decoded_result[1]
+
+    ref_time = datetime.fromtimestamp(timestamp, tz=timezone.utc)
+    assert ref_time == mmp_request_handler._reference_time
+
+    assert isinstance(checkpoints, dict)
+    assert checkpoints.keys() == {'wallclock_time', 'simulation_time'}
+    assert checkpoints['simulation_time'] == []
+    wallclock_time = checkpoints['wallclock_time']
+    assert len(wallclock_time) == 2
+    assert wallclock_time[0] == {'start': None, 'stop': None, 'every': 10}
+    assert wallclock_time[1] == {'at': [1, 2, 3.0]}
+
+    assert resume is not None
+    assert Path(resume) == resume_path
+
+
 def test_double_register_instance(mmp_request_handler):
     request = [
             RequestType.REGISTER_INSTANCE.value,
diff --git a/libmuscle/python/libmuscle/mmp_client.py b/libmuscle/python/libmuscle/mmp_client.py
index f8f05330..6376aa20 100644
--- a/libmuscle/python/libmuscle/mmp_client.py
+++ b/libmuscle/python/libmuscle/mmp_client.py
@@ -60,15 +60,15 @@ def decode_checkpoint_rule(rule: Dict[str, Any]) -> CheckpointRule:
 
 
 def decode_checkpoint_info(
-        utc_walltime_reference: Tuple[int, int, int, int, int, int, int],
+        reference_timestamp: float,
         checkpoints_dict: Dict[str, List[Dict[str, Any]]],
         resume: Optional[str]
         ) -> Tuple[datetime, Checkpoints, Optional[Path]]:
     """Decode checkpoint info from a MsgPack-compatible value.
 
     Args:
-        utc_walltime_reference: tuple (year, month, day, hour, minute, second,
-            microsecond) in UTC timezone
+        reference_timestamp: seconds since UNIX epoch in UTC timezone to use as
+            wallclock_time = 0
         checkpoints_dict: dictionary of checkpoint definitions
         resume: optional string indicating resume path
 
@@ -77,7 +77,7 @@ def decode_checkpoint_info(
         checkpoints: checkpoint configuration
         resume: path to the resume snapshot
     """
-    ref_time = datetime(*utc_walltime_reference, tzinfo=timezone.utc)
+    ref_time = datetime.fromtimestamp(reference_timestamp, tz=timezone.utc)
     checkpoints = Checkpoints(
             wallclock_time=[decode_checkpoint_rule(rule)
                             for rule in checkpoints_dict["wallclock_time"]],
diff --git a/libmuscle/python/libmuscle/test/test_mmp_client.py b/libmuscle/python/libmuscle/test/test_mmp_client.py
index e93d6014..a47311a6 100644
--- a/libmuscle/python/libmuscle/test/test_mmp_client.py
+++ b/libmuscle/python/libmuscle/test/test_mmp_client.py
@@ -74,12 +74,8 @@ def test_get_settings(mocked_mmp_client) -> None:
 def test_register_instance(mocked_mmp_client) -> None:
     client, stub = mocked_mmp_client
 
-    reftime = datetime.now(timezone.utc)
-    reference_time_tuple = (reftime.year, reftime.month, reftime.day,
-                            reftime.hour, reftime.minute,
-                            reftime.second, reftime.microsecond)
     result = [ResponseType.SUCCESS.value,
-              (reference_time_tuple,
+              (datetime.now(timezone.utc).timestamp(),
                {'wallclock_time': [], 'simulation_time': []},
                None)]
     stub.call.return_value = msgpack.packb(result, use_bin_type=True)

From 23f7f873d273430b73d15d0407450c70b71f8cb5 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Thu, 8 Sep 2022 15:10:22 +0200
Subject: [PATCH 037/183] Implementation of workflow snapshot heuristic

---
 .../libmuscle/manager/snapshot_registry.py    | 438 ++++++++++++++++++
 .../manager/test/test_snapshot_registry.py    | 330 +++++++++++++
 2 files changed, 768 insertions(+)
 create mode 100644 libmuscle/python/libmuscle/manager/snapshot_registry.py
 create mode 100644 libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py

diff --git a/libmuscle/python/libmuscle/manager/snapshot_registry.py b/libmuscle/python/libmuscle/manager/snapshot_registry.py
new file mode 100644
index 00000000..1fd4b12d
--- /dev/null
+++ b/libmuscle/python/libmuscle/manager/snapshot_registry.py
@@ -0,0 +1,438 @@
+from dataclasses import dataclass, field
+from enum import Flag, auto
+from itertools import chain, zip_longest
+from operator import attrgetter
+from typing import Dict, Optional, Set, List, Tuple, TypeVar
+
+from ymmsl import Reference, Configuration, Identifier, Implementation
+from ymmsl import ImplementationState as IState
+
+from libmuscle.snapshot import SnapshotMetadata
+
+
+_SnapshotDictType = Dict[Reference, List["SnapshotNode"]]
+_ConnectionType = Tuple[Identifier, Identifier, "_ConnectionInfo"]
+_T = TypeVar("_T")
+
+
+def safe_get(lst: List[_T], index: int, default: _T) -> _T:
+    """Get an item from the list, returning default when it does not exist.
+
+    Args:
+        lst: List to get the item from
+        index: Which item to get, should be >= 0
+        default: Value to return when hitting an IndexError
+    """
+    try:
+        return lst[index]
+    except IndexError:
+        return default
+
+
+class _ConnectionInfo(Flag):
+    SELF_IS_SENDING = auto()
+    SELF_IS_VECTOR = auto()
+    PEER_IS_VECTOR = auto()
+
+
+def calc_consistency(num1: int, num2: int, first_is_sent: bool) -> bool:
+    """Calculate consistency of message counts.
+
+    Args:
+        num1: message count of instance 1
+        num2: message count of instance 2
+        first_is_sent: True iff instance 1 is sending messages over this conduit
+
+    Returns:
+        True iff the two message counts are consistent
+    """
+    return (num1 == num2 or                             # strong
+            num1 + 1 == num2 and first_is_sent or       # weak (1 = sent)
+            num2 + 1 == num1 and not first_is_sent)     # weak (2 = sent)
+
+
+def calc_consistency_list(
+        num1: List[int], num2: List[int], first_is_sent: bool) -> bool:
+    """Calculate consistency of message counts.
+
+    Args:
+        num1: message count of instance 1
+        num2: message count of instance 2
+        first_is_sent: True iff instance 1 is sending messages over this conduit
+
+    Returns:
+        True iff the two message counts are consistent
+    """
+    if first_is_sent:
+        slot_iter = zip_longest(num1, num2, fillvalue=0)
+    else:
+        slot_iter = zip_longest(num2, num1, fillvalue=0)
+    return all(slot_sent == slot_received or        # strong
+               slot_sent + 1 == slot_received       # weak
+               for slot_sent, slot_received in slot_iter)
+
+
+@dataclass
+class SnapshotNode:
+    """Represents a node in the snapshot graph.
+
+    Attributes:
+        num: The number of the snapshot. Unique for this instance. Later
+            snapshots always have a higher num.
+        instance: Which instance this is a snapshot of.
+        snapshot: The snapshot metadata reported by the instance.
+        stateful_peers: The set of peers that the instance is connected to that
+            have state, which we need to check consistency with.
+        consistent_peers: Keeps track of snapshots per peer that are consistent
+            with this one.
+    """
+    num: int
+    instance: Reference
+    snapshot: SnapshotMetadata
+    stateful_peers: Set[Reference]
+    consistent_peers: Dict[Reference, List["SnapshotNode"]] = field(
+            default_factory=dict, repr=False)
+
+    def __hash__(self) -> int:
+        return object.__hash__(self)
+
+    @property
+    def consistent(self) -> bool:
+        """Returns True iff there is a consistent checkpoint will all stateful
+        peers.
+        """
+        return self.consistent_peers.keys() == self.stateful_peers
+
+    def do_consistency_check(
+            self,
+            peer_node: "SnapshotNode",
+            connections: List[_ConnectionType]) -> bool:
+        """Check if the snapshot of the peer is consistent with us.
+
+        When the peer snapshot is consistent, adds it to our list of consistent
+        peer snapshots (in :attribute:`consistent_peers`) and vice versa.
+
+        Args:
+            peer_node: Snapshot of one of our peers
+            connections: All connections from our instance to the peer instance
+
+        Returns:
+            True iff the peer snapshot is consistent with ours.
+        """
+        i_snapshot = self.snapshot
+        p_snapshot = peer_node.snapshot
+        for connection in connections:
+            i_port, p_port, conn = connection
+            is_sending = bool(conn & _ConnectionInfo.SELF_IS_SENDING)
+            i_msg_counts = i_snapshot.port_message_counts.get(str(i_port), [])
+            p_msg_counts = p_snapshot.port_message_counts.get(str(p_port), [])
+            if conn & _ConnectionInfo.SELF_IS_VECTOR:
+                slot = int(peer_node.instance[-1])
+                consistent = calc_consistency(
+                        safe_get(i_msg_counts, slot, 0),
+                        safe_get(p_msg_counts, 0, 0),
+                        is_sending)
+            elif conn & _ConnectionInfo.PEER_IS_VECTOR:
+                slot = int(self.instance[-1])
+                consistent = calc_consistency(
+                        safe_get(i_msg_counts, 0, 0),
+                        safe_get(p_msg_counts, slot, 0),
+                        is_sending)
+            else:
+                consistent = calc_consistency_list(
+                        i_msg_counts, p_msg_counts, is_sending)
+            if not consistent:  # not consistent
+                return False
+        self.consistent_peers.setdefault(
+                peer_node.instance, []).append(peer_node)
+        peer_node.consistent_peers.setdefault(
+                self.instance, []).append(self)
+        return True
+
+
+class SnapshotRegistry:
+    """Registry of all snapshots taken by instances.
+
+    Current snapshots are stored in a graph. Every node represents a snapshot
+    taken by an instance (see :class:`SnapshotNode`). When snapshots from peer
+    instances are consistent, the nodes are connected to each other.
+
+    This class manages the snapshot nodes. New snapshots are registered through
+    :meth:`register_snapshot`.
+    """
+
+    def __init__(self, configuration: Configuration) -> None:
+        """Create a snapshot graph using provided configuration.
+
+        Args:
+            configuration: ymmsl configuration describing the workflow.
+        """
+        self._configuration = configuration
+
+        self._snapshots = {}                # type: _SnapshotDictType
+
+        self._instances = set()             # type: Set[Reference]
+        self._stateful_instances = set()    # type: Set[Reference]
+        for component in configuration.model.components:
+            instances = set(component.instances())
+            self._instances.update(instances)
+            if self._is_stateful(component.name):
+                self._stateful_instances.update(instances)
+
+    def register_snapshot(
+            self, instance: Reference, snapshot: SnapshotMetadata) -> None:
+        """Register a new snapshot.
+
+        Args:
+            instance: The instance that created the snapshot
+            snapshot: Metadata describing the snapshot
+        """
+        stateful_peers = self._get_stateful_peers(instance)
+
+        i_snapshots = self._snapshots.setdefault(instance, [])
+        # get next number of the snapshot
+        num = 1 if not i_snapshots else i_snapshots[-1].num + 1
+        snapshotnode = SnapshotNode(num, instance, snapshot, stateful_peers)
+        i_snapshots.append(snapshotnode)
+
+        # check consistency with all peers
+        for peer in stateful_peers:
+            for peer_snapshot in self._snapshots.get(peer, []):
+                snapshotnode.do_consistency_check(
+                        peer_snapshot, self._get_connections(instance, peer))
+
+        # finally, check if this snapshotnode is now part of a workflow snapshot
+        self._save_workflow_snapshot(snapshotnode)
+
+    def _save_workflow_snapshot(self, snapshotnode: SnapshotNode) -> None:
+        """Save snapshot if a workflow snapshot exists with the provided node.
+
+        Args:
+            snapshotnode: The snapshot node that must be part of the workflow
+                snapshot.
+        """
+        selected_snapshots = self._get_workflow_snapshot(snapshotnode)
+        if selected_snapshots is not None:
+            self._write_snapshot_ymmsl(selected_snapshots)
+            self._cleanup_snapshots(selected_snapshots)
+
+    def _get_workflow_snapshot(
+            self, snapshot: SnapshotNode) -> Optional[List[SnapshotNode]]:
+        """Check if a workflow snapshot exists that contains the provided node.
+
+        Note: if the provided snapshot node is part of multiple workflow
+        snapshots, only the most recent is detected and written to disk.
+
+        Args:
+            snapshotnode: The snapshot node that must be part of the workflow
+                snapshot.
+        """
+        # This implements a greedy assignment algorithm.
+        if not snapshot.consistent:
+            return None
+
+        # Instances that don't have a snapshot node chosen yet:
+        instances_to_cover = list(
+                self._stateful_instances - {snapshot.instance})
+        # Allowed snapshots per instance. This is updated during the heuristic
+        # to further restrict the sets of snapshots as peer snapshots are
+        # selected.
+        # First restriction is that the snapshots have to be locally consistent.
+        allowed_snapshots = {}  # type: Dict[Reference, Set[SnapshotNode]]
+        for instance in instances_to_cover:
+            allowed_snapshots[instance] = set(
+                    i_snapshot
+                    for i_snapshot in self._snapshots.get(instance, [])
+                    if i_snapshot.consistent)
+            if not allowed_snapshots[instance]:
+                # there cannot be a workflow snapshot if this instance has no
+                # consistent snapshot nodes
+                return None
+        instance = snapshot.instance
+        allowed_snapshots[instance] = {snapshot}
+
+        def num_allowed_snapshots(instance: Reference) -> int:
+            """Get number of allowed snapshots at this point for this instance.
+
+            The allowed snapshots are those that are consistent with all
+            selected snapshots at this point in the heuristic.
+            """
+            return len(allowed_snapshots[instance])
+
+        selected_snapshots = [snapshot]
+        # This stack stores history of allowed_snapshots and enables roll back
+        stack = []  # type: List[Dict[Reference, Set[SnapshotNode]]]
+
+        # update allowed_snapshots for peers
+        for peer, snapshots in snapshot.consistent_peers.items():
+            allowed_snapshots[peer].intersection_update(snapshots)
+            if not allowed_snapshots[peer]:
+                return None
+
+        while instances_to_cover:
+            # select most constrained instance
+            #
+            # Note: we're only interested in the instance with the least allowed
+            # snapshots. Better performance may be possible by not doing a full
+            # sort, but it should be tested. Expectation is that
+            # instances_to_cover remains mostly sorted (as the only counts that
+            # are changing are for peers of the previous selected instance).
+            # Python's sort algorithm is O(N) when the list is already sorted
+            # (which is the same as max()).
+            #
+            # We cannot use a priority queue (heapq) because
+            # num_allowed_snapshots is changing every iteration.
+            instances_to_cover.sort(key=num_allowed_snapshots, reverse=True)
+            instance = instances_to_cover.pop()
+
+            # select latest snapshot of this instance
+            snapshot = max(allowed_snapshots[instance], key=attrgetter("num"))
+            selected_snapshots.append(snapshot)
+            # we put a shallow copy on the stack, so we are not allowed to
+            # modify the sets in the dictionary (see below)
+            stack.append(allowed_snapshots.copy())
+
+            # update allowed snapshots with the currently selected
+            allowed_snapshots[instance] = {snapshot}
+            for peer, snapshots in snapshot.consistent_peers.items():
+                # not updating in place to preserve set objects in the stack
+                intersection = allowed_snapshots[peer].intersection(snapshots)
+                if not intersection:
+                    break  # roll back
+                allowed_snapshots[peer] = intersection
+            else:
+                # not rolling back, go into next iteration of the while-loop
+                continue
+
+            # roll back should stop when selected_snapshots only contains the
+            # one we forced to be part of the workflow snapshot
+            while len(selected_snapshots) > 1:
+                # roll back
+                snapshot = selected_snapshots.pop()
+                instance = snapshot.instance
+                instances_to_cover.append(instance)
+                allowed_snapshots = stack.pop()
+                allowed_snapshots[instance].remove(snapshot)
+                if allowed_snapshots[instance]:
+                    # we have a valid next snapshot to try for this instance
+                    break
+                # no allowed_snapshots, try another roll back
+            else:
+                # we've exhausted roll back possibilities, there is no
+                # consistent checkpoint
+                return None
+
+        return selected_snapshots
+
+    def _write_snapshot_ymmsl(
+            self, selected_snapshot: List[SnapshotNode]) -> None:
+        ...
+
+    def _cleanup_snapshots(
+            self, selected_snapshots: List[SnapshotNode]) -> None:
+        # remove all snapshots older than the selected ones
+        removed_snapshots = set()  # type: Set[SnapshotNode]
+        for snapshot in selected_snapshots:
+            all_snapshots = self._snapshots[snapshot.instance]
+            idx = all_snapshots.index(snapshot)
+            self._snapshots[snapshot.instance] = all_snapshots[idx:]
+            removed_snapshots.update(all_snapshots[:idx])
+        # remove all references in SnapshotNode.peer_snapshot to the snapshots
+        # that are cleaned up
+        for snapshot in removed_snapshots:
+            for peer_snapshot in chain.from_iterable(
+                    snapshot.consistent_peers.values()):
+                if peer_snapshot in removed_snapshots:
+                    # snapshot is removed anyway, no need to update references
+                    continue
+                # peer_snapshot is still there, remove reference to us
+                peer_snapshot.consistent_peers[snapshot.instance].remove(
+                        snapshot)
+
+    # TODO: add caching decorator or move into an instance variable
+    def _get_stateful_peers(self, instance: Reference) -> Set[Reference]:
+        peers = set()  # type: Set[Reference]
+        kernel = instance.without_trailing_ints()
+        index = [int(instance[i]) for i in range(len(kernel), len(instance))]
+        for conduit in self._configuration.model.conduits:
+            if conduit.sending_component() == kernel:
+                peer_kernel = conduit.receiving_component()
+            elif conduit.receiving_component() == kernel:
+                peer_kernel = conduit.sending_component()
+            else:
+                continue
+            if not self._is_stateful(peer_kernel):
+                continue
+            if len(index) == len(self._multiplicity(peer_kernel)):
+                # we must be sending to the peer with the same index as us
+                peers.add(peer_kernel + index)
+            elif len(index) + 1 == len(self._multiplicity(peer_kernel)):
+                # we are sending on a vector port, peer is receiving non-vector
+                # generate all peer indices
+                for i in range(self._multiplicity(peer_kernel)[-1]):
+                    peers.add(peer_kernel + index + i)
+            elif len(index) - 1 == len(self._multiplicity(peer_kernel)):
+                # we are sending to a vector port, strip last of our indices
+                peers.add(peer_kernel + index[:-1])
+        return peers
+
+    # TODO: add caching decorator or move into an instance variable
+    def _get_connections(self, instance: Reference, peer: Reference
+                         ) -> List[_ConnectionType]:
+        instance_kernel = instance.without_trailing_ints()
+        peer_kernel = peer.without_trailing_ints()
+
+        connected_ports = []  # type: List[_ConnectionType]
+        for conduit in self._configuration.model.conduits:
+            if (conduit.sending_component() == instance_kernel and
+                    conduit.receiving_component() == peer_kernel):
+                conn_type = _ConnectionInfo.SELF_IS_SENDING
+            elif (conduit.receiving_component() == instance_kernel and
+                    conduit.sending_component() == peer_kernel):
+                conn_type = _ConnectionInfo(0)
+            else:
+                continue
+            instance_ndim = (len(instance) - len(instance_kernel))
+            peer_ndim = (len(peer) - len(peer_kernel))
+            if instance_ndim < peer_ndim:
+                conn_type |= _ConnectionInfo.SELF_IS_VECTOR
+            if instance_ndim > peer_ndim:
+                conn_type |= _ConnectionInfo.PEER_IS_VECTOR
+            # we cannot distinguish scalar-scalar vs. vector-vector
+            # but it does not matter for this logic :)
+            if conn_type & _ConnectionInfo.SELF_IS_SENDING:
+                connected_ports.append((
+                        conduit.sending_port(),
+                        conduit.receiving_port(),
+                        conn_type))
+            else:
+                connected_ports.append((
+                        conduit.receiving_port(),
+                        conduit.sending_port(),
+                        conn_type))
+        return connected_ports
+
+    # TODO: add caching decorator or move into an instance variable
+    def _multiplicity(self, kernel: Reference) -> List[int]:
+        for component in self._configuration.model.components:
+            if component.name == kernel:
+                return component.multiplicity
+        raise KeyError(str(kernel))
+
+    # TODO: add caching decorator or move into an instance variable
+    def _implementation(self, kernel: Reference) -> Optional[Implementation]:
+        implementation = None
+        for component in self._configuration.model.components:
+            if component.name == kernel:
+                implementation = component.implementation
+        if implementation in self._configuration.implementations:
+            return self._configuration.implementations[implementation]
+        return None
+
+    def _is_stateful(self, kernel: Reference) -> bool:
+        implementation = self._implementation(kernel)
+        if implementation is None:
+            return True  # assume stateful
+        return (implementation.stateful is IState.STATEFUL or
+                implementation.stateful is IState.WEAKLY_STATEFUL and
+                implementation.supports_checkpoint)
diff --git a/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py b/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py
new file mode 100644
index 00000000..7485b0a8
--- /dev/null
+++ b/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py
@@ -0,0 +1,330 @@
+from unittest.mock import MagicMock
+
+import pytest
+from libmuscle.snapshot import SnapshotMetadata
+from ymmsl import (
+        Configuration, Model, Component, Conduit, Implementation,
+        ImplementationState as IState, Reference)
+
+from libmuscle.manager.snapshot_registry import (
+    SnapshotRegistry, calc_consistency, calc_consistency_list, safe_get,
+    _ConnectionInfo)
+
+
+def make_snapshot(**msg_counts) -> SnapshotMetadata:
+    return SnapshotMetadata([], 0, 0, 0, {**msg_counts}, False, '')
+
+
+@pytest.fixture(params=[True, False])
+def micro_is_stateless(request: pytest.FixtureRequest) -> bool:
+    return request.param
+
+
+@pytest.fixture
+def macro_micro(micro_is_stateless: bool) -> Configuration:
+    components = [
+            Component('macro', 'macro_impl'),
+            Component('micro', 'micro_impl')]
+    conduits = [
+            Conduit('macro.o_i', 'micro.f_i'),
+            Conduit('micro.o_f', 'macro.s')]
+    model = Model('macro_micro', components, conduits)
+
+    if micro_is_stateless:
+        micro_impl = Implementation(
+                'micro_impl', stateful=IState.STATELESS, executable='pass')
+    else:
+        micro_impl = Implementation(
+                'micro_impl', supports_checkpoint=True, executable='pass')
+
+    implementations = [
+            Implementation(
+                    'macro_impl', supports_checkpoint=True, executable='pass'),
+            micro_impl]
+
+    return Configuration(model, implementations=implementations)
+
+
+@pytest.fixture
+def uq(macro_micro: Configuration) -> Configuration:
+    for component in macro_micro.model.components:
+        component.multiplicity = [5]
+    macro_micro.model.components.append(Component('qmc', 'qmc_impl'))
+    macro_micro.model.components.append(Component('rr', 'rr_impl'))
+    macro_micro.model.conduits.extend([
+            Conduit('qmc.parameters_out', 'rr.front_in'),
+            Conduit('rr.front_out', 'qmc.states_in'),
+            Conduit('rr.back_out', 'macro.muscle_settings_in'),
+            Conduit('macro.final_state_out', 'rr.back_in')])
+    macro_micro.implementations[Reference('qmc_impl')] = Implementation(
+            'qmc_impl', supports_checkpoint=True, executable='pass')
+    macro_micro.implementations[Reference('rr_impl')] = Implementation(
+            'rr_impl', supports_checkpoint=True, executable='pass')
+    return macro_micro
+
+
+def test_safe_get() -> None:
+    assert safe_get([], 0, 1) == 1
+    assert safe_get([3], 0, 1) == 3
+    assert safe_get([3], 1, 5) == 5
+    for i in range(10):
+        expected = -1 if i >= 3 else i + 3
+        assert safe_get([3, 4, 5], i, -1) == expected
+
+
+def test_calc_consistency() -> None:
+    num_sent = 3
+    for num_received in [2, 3, 4, 5]:
+        consistent = num_received in [3, 4]
+        assert calc_consistency(num_sent, num_received, True) is consistent
+        assert calc_consistency(num_received, num_sent, False) is consistent
+
+    num_received = 10
+    for num_sent in [8, 9, 10, 11]:
+        consistent = num_sent in [9, 10]
+        assert calc_consistency(num_sent, num_received, True) is consistent
+        assert calc_consistency(num_received, num_sent, False) is consistent
+
+
+def test_calc_consistency_list() -> None:
+    num_sent = [3, 3]
+    for num_received in [[2, 3], [3, 2], [3, 5], [], [4, 4, 0, 0, 2]]:
+        assert not calc_consistency_list(num_sent, num_received, True)
+        assert not calc_consistency_list(num_received, num_sent, False)
+    for num_received in [[3, 3], [3, 4], [4, 3], [4, 4],
+                         [3, 3, 1], [4, 4, 0, 0, 0, 1, 0, 1]]:
+        assert calc_consistency_list(num_sent, num_received, True)
+        assert calc_consistency_list(num_received, num_sent, False)
+
+
+def test_stateful_peers(uq: Configuration, micro_is_stateless: bool) -> None:
+    snapshot_registry = SnapshotRegistry(uq)
+    macro = Reference('macro')
+    micro = Reference('micro')
+    qmc = Reference('qmc')
+    rr = Reference('rr')
+
+    expected_stateful = {qmc, rr} | {macro + i for i in range(5)}
+    if not micro_is_stateless:
+        expected_stateful.update(micro + i for i in range(5))
+    assert snapshot_registry._stateful_instances == expected_stateful
+
+    assert snapshot_registry._get_stateful_peers(qmc) == {rr}
+    expected_rr_peers = {qmc} | {macro + i for i in range(5)}
+    assert snapshot_registry._get_stateful_peers(rr) == expected_rr_peers
+    for i in range(5):
+        expected_peers = {rr} if micro_is_stateless else {rr, micro + i}
+        assert snapshot_registry._get_stateful_peers(macro + i) == expected_peers
+        assert snapshot_registry._get_stateful_peers(micro + i) == {macro + i}
+
+
+def test_connections(uq: Configuration) -> None:
+    snapshot_registry = SnapshotRegistry(uq)
+    macro = Reference('macro')
+    micro = Reference('micro')
+    qmc = Reference('qmc')
+    rr = Reference('rr')
+
+    assert not snapshot_registry._get_connections(qmc, macro + 1)
+    assert not snapshot_registry._get_connections(macro + 3, qmc)
+    assert not snapshot_registry._get_connections(qmc, micro + 0)
+    assert not snapshot_registry._get_connections(micro + 1, qmc)
+    assert not snapshot_registry._get_connections(rr, micro + 4)
+    assert not snapshot_registry._get_connections(micro + 0, rr)
+
+    connections = snapshot_registry._get_connections(rr, qmc)
+    assert len(connections) == 2
+    for rr_port, qmc_port, info in connections:
+        assert rr_port in (Reference('front_out'), Reference('front_in'))
+        assert qmc_port in (Reference('parameters_out'), Reference('states_in'))
+        is_sending = bool(info & _ConnectionInfo.SELF_IS_SENDING)
+        assert is_sending is (rr_port == Reference('front_out'))
+        # Note: actually both are vector ports, but this is undetectable from
+        # the ymmsl configuration. Luckily we treat it the same as scalar-scalar
+        assert not (info & _ConnectionInfo.SELF_IS_VECTOR)
+        assert not (info & _ConnectionInfo.PEER_IS_VECTOR)
+
+    connections = snapshot_registry._get_connections(macro + 0, rr)
+    assert len(connections) == 2
+    for macro_port, rr_port, info in connections:
+        assert macro_port in (
+                Reference('muscle_settings_in'), Reference('final_state_out'))
+        assert rr_port in (Reference('back_out'), Reference('back_in'))
+        is_sending = bool(info & _ConnectionInfo.SELF_IS_SENDING)
+        assert is_sending is (macro_port == Reference('final_state_out'))
+        assert not (info & _ConnectionInfo.SELF_IS_VECTOR)
+        assert (info & _ConnectionInfo.PEER_IS_VECTOR)
+
+    connections = snapshot_registry._get_connections(rr, macro + 1)
+    assert len(connections) == 2
+    for rr_port, macro_port, info in connections:
+        assert macro_port in (
+                Reference('muscle_settings_in'), Reference('final_state_out'))
+        assert rr_port in (Reference('back_out'), Reference('back_in'))
+        is_sending = bool(info & _ConnectionInfo.SELF_IS_SENDING)
+        assert is_sending is (rr_port == Reference('back_out'))
+        assert (info & _ConnectionInfo.SELF_IS_VECTOR)
+        assert not (info & _ConnectionInfo.PEER_IS_VECTOR)
+
+
+def test_macro_micro_snapshots(
+        macro_micro: Configuration, micro_is_stateless: bool) -> None:
+    snapshot_registry = SnapshotRegistry(macro_micro)
+    # prevent actually writing a ymmsl file, testing that separately
+    snapshot_registry._write_snapshot_ymmsl = MagicMock()
+    macro = Reference('macro')
+    micro = Reference('micro')
+
+    macro_snapshot = make_snapshot(o_i=[3], s=[3])
+    snapshot_registry.register_snapshot(macro, macro_snapshot)
+
+    assert len(snapshot_registry._snapshots[macro]) == 1
+    node = snapshot_registry._snapshots[macro][0]
+    assert node.consistent is micro_is_stateless
+    assert node.consistent_peers == {}
+    assert node.instance == macro
+    assert node.num == 1
+    assert node.snapshot is macro_snapshot
+    if micro_is_stateless:
+        assert node.stateful_peers == set()
+        snapshot_registry._write_snapshot_ymmsl.assert_called_once_with([node])
+        snapshot_registry._write_snapshot_ymmsl.reset_mock()
+    else:
+        assert node.stateful_peers == {micro}
+        snapshot_registry._write_snapshot_ymmsl.assert_not_called()
+
+    if not micro_is_stateless:
+        # Note: this snapshot is not realistic, it should have come in before
+        # the macro snapshot above. However, it's still useful for testing the
+        # consistency algorithm
+        micro_snapshot = make_snapshot(f_i=[2], o_f=[1])
+        snapshot_registry.register_snapshot(micro, micro_snapshot)
+
+        assert len(snapshot_registry._snapshots[micro]) == 1
+        assert not snapshot_registry._snapshots[micro][0].consistent
+        snapshot_registry._write_snapshot_ymmsl.assert_not_called()
+
+        micro_snapshot = make_snapshot(f_i=[3], o_f=[2])
+        snapshot_registry.register_snapshot(micro, micro_snapshot)
+
+        # micro snapshots should be cleaned up now!
+        assert len(snapshot_registry._snapshots[micro]) == 1
+        micro_node = snapshot_registry._snapshots[micro][0]
+        assert micro_node.consistent
+        snapshot_registry._write_snapshot_ymmsl.assert_called_with(
+                [micro_node, node])
+        snapshot_registry._write_snapshot_ymmsl.reset_mock()
+
+        micro_snapshot = make_snapshot(f_i=[4], o_f=[3])
+        snapshot_registry.register_snapshot(micro, micro_snapshot)
+
+        # micro snapshots should be cleaned up now!
+        assert len(snapshot_registry._snapshots[micro]) == 1
+        micro_node = snapshot_registry._snapshots[micro][0]
+        assert micro_node.consistent
+        snapshot_registry._write_snapshot_ymmsl.assert_called_with(
+                [micro_node, node])
+        snapshot_registry._write_snapshot_ymmsl.reset_mock()
+
+    macro_snapshot = make_snapshot(o_i=[4], s=[4])
+    snapshot_registry.register_snapshot(macro, macro_snapshot)
+    snapshot_registry._write_snapshot_ymmsl.assert_called_once()
+
+
+def test_uq(uq: Configuration, micro_is_stateless: bool) -> None:
+    snapshot_registry = SnapshotRegistry(uq)
+    # prevent actually writing a ymmsl file, testing that separately
+    snapshot_registry._write_snapshot_ymmsl = MagicMock()
+    macro = Reference('macro')
+    micro = Reference('micro')
+    qmc = Reference('qmc')
+    rr = Reference('rr')
+
+    qmc_snapshot = make_snapshot(parameters_out=[], states_in=[])
+    snapshot_registry.register_snapshot(qmc, qmc_snapshot)
+
+    rr_snapshot = make_snapshot(
+            front_in=[1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
+            front_out=[0] * 10,
+            back_out=[1, 1, 1, 1, 1],
+            back_in=[0] * 5)
+    snapshot_registry.register_snapshot(rr, rr_snapshot)
+    node = snapshot_registry._snapshots[rr][-1]
+    assert qmc in node.consistent_peers
+    snapshot_registry._write_snapshot_ymmsl.assert_not_called()
+
+    macro_snapshot = make_snapshot(
+            muscle_settings_in=[1], final_state_out=[0], o_i=[0], s=[0])
+    for i in range(5):
+        snapshot_registry.register_snapshot(macro + i, macro_snapshot)
+        node = snapshot_registry._snapshots[macro + i][-1]
+        assert node.consistent_peers.keys() == {rr}
+        if micro_is_stateless and i == 4:
+            snapshot_registry._write_snapshot_ymmsl.assert_called_once()
+            snapshot_registry._write_snapshot_ymmsl.reset_mock()
+        else:
+            snapshot_registry._write_snapshot_ymmsl.assert_not_called()
+
+    if not micro_is_stateless:
+        micro_snapshot = make_snapshot(f_i=[1], o_f=[0])
+        for i in range(5):
+            snapshot_registry.register_snapshot(micro + i, micro_snapshot)
+            node = snapshot_registry._snapshots[micro + i][-1]
+            assert node.consistent_peers.keys() == {macro + i}
+            if i == 4:
+                snapshot_registry._write_snapshot_ymmsl.assert_called_once()
+                snapshot_registry._write_snapshot_ymmsl.reset_mock()
+            else:
+                snapshot_registry._write_snapshot_ymmsl.assert_not_called()
+
+    qmc_snapshot = make_snapshot(parameters_out=[1, 1, 1, 1, 1], states_in=[])
+    snapshot_registry.register_snapshot(qmc, qmc_snapshot)
+    node = snapshot_registry._snapshots[qmc][-1]
+    assert node.consistent_peers.keys() == {rr}
+    snapshot_registry._write_snapshot_ymmsl.assert_called_once()
+    snapshot_registry._write_snapshot_ymmsl.reset_mock()
+    assert len(snapshot_registry._snapshots[qmc]) == 1  # previous is cleaned up
+
+
+def test_heuristic_rollbacks() -> None:
+    components = [Component(f'comp{i}', f'impl{i}') for i in range(4)]
+    conduits = [Conduit(f'comp{i}.o_f', f'comp{i+1}.f_i') for i in range(3)]
+    model = Model('linear', components, conduits)
+    implementations = [
+            Implementation(f'impl{i}', supports_checkpoint=True, script='xyz')
+            for i in range(4)]
+    config = Configuration(model, implementations=implementations)
+
+    comp1, comp2, comp3, comp4 = (Reference(f'comp{i}') for i in range(4))
+
+    snapshot_registry = SnapshotRegistry(config)
+    # prevent actually writing a ymmsl file, testing that separately
+    snapshot_registry._write_snapshot_ymmsl = MagicMock()
+
+    for i in range(4):
+        snapshot_registry.register_snapshot(comp1, make_snapshot(o_f=[i]))
+    assert len(snapshot_registry._snapshots[comp1]) == 4
+
+    for i in range(10):
+        snapshot_registry.register_snapshot(
+                comp2, make_snapshot(f_i=[1], o_f=[0]))
+        snapshot_registry.register_snapshot(
+                comp3, make_snapshot(f_i=[1], o_f=[0]))
+    assert len(snapshot_registry._snapshots[comp2]) == 10
+    assert len(snapshot_registry._snapshots[comp3]) == 10
+
+    snapshot_registry.register_snapshot(comp2, make_snapshot(f_i=[2], o_f=[1]))
+    assert len(snapshot_registry._snapshots[comp2]) == 11
+    snapshot_registry.register_snapshot(comp2, make_snapshot(f_i=[3], o_f=[2]))
+    assert len(snapshot_registry._snapshots[comp2]) == 12
+
+    snapshot_registry._write_snapshot_ymmsl.assert_not_called()
+
+    snapshot_registry.register_snapshot(
+            comp4, make_snapshot(f_i=[1]))
+    snapshot_registry._write_snapshot_ymmsl.assert_called()
+
+    assert len(snapshot_registry._snapshots[comp1]) == 2
+    assert len(snapshot_registry._snapshots[comp2]) == 2
+    assert len(snapshot_registry._snapshots[comp3]) == 1
+    assert len(snapshot_registry._snapshots[comp4]) == 1

From 13b9cee722011cf3d7e026c909d646b40c1f97c7 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Thu, 8 Sep 2022 16:17:01 +0200
Subject: [PATCH 038/183] Additional snapshot registry tests

---
 .../manager/test/test_snapshot_registry.py    | 29 +++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py b/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py
index 7485b0a8..d9d83068 100644
--- a/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py
+++ b/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py
@@ -167,6 +167,35 @@ def test_connections(uq: Configuration) -> None:
         assert not (info & _ConnectionInfo.PEER_IS_VECTOR)
 
 
+def test_multiplicity(uq: Configuration) -> None:
+    snapshot_registry = SnapshotRegistry(uq)
+    assert snapshot_registry._multiplicity(Reference('qmc')) == []
+    assert snapshot_registry._multiplicity(Reference('rr')) == []
+    assert snapshot_registry._multiplicity(Reference('macro')) == [5]
+    assert snapshot_registry._multiplicity(Reference('micro')) == [5]
+
+
+def test_implementation(uq: Configuration) -> None:
+    snapshot_registry = SnapshotRegistry(uq)
+
+    qmc_impl = snapshot_registry._implementation(Reference('qmc'))
+    assert qmc_impl.name == 'qmc_impl'
+
+    missing_impl = snapshot_registry._implementation(Reference('missing'))
+    assert missing_impl is None
+
+
+def test_stateful(uq: Configuration, micro_is_stateless: bool) -> None:
+    uq.implementations['macro_impl'].stateful = IState.WEAKLY_STATEFUL
+    snapshot_registry = SnapshotRegistry(uq)
+
+    assert snapshot_registry._is_stateful(Reference('macro'))
+    stateful = snapshot_registry._is_stateful(Reference('micro'))
+    assert stateful is not micro_is_stateless
+
+    assert snapshot_registry._is_stateful(Reference('unknown'))
+
+
 def test_macro_micro_snapshots(
         macro_micro: Configuration, micro_is_stateless: bool) -> None:
     snapshot_registry = SnapshotRegistry(macro_micro)

From 1b8c97ad451b7cd3aa0f5fa2c77ca5999346b32e Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Fri, 9 Sep 2022 15:41:41 +0200
Subject: [PATCH 039/183] Add caching decorators and more docstrings.

---
 .../libmuscle/manager/snapshot_registry.py    | 69 +++++++++++++++++--
 1 file changed, 65 insertions(+), 4 deletions(-)

diff --git a/libmuscle/python/libmuscle/manager/snapshot_registry.py b/libmuscle/python/libmuscle/manager/snapshot_registry.py
index 1fd4b12d..49c5351b 100644
--- a/libmuscle/python/libmuscle/manager/snapshot_registry.py
+++ b/libmuscle/python/libmuscle/manager/snapshot_registry.py
@@ -1,5 +1,6 @@
 from dataclasses import dataclass, field
 from enum import Flag, auto
+from functools import lru_cache
 from itertools import chain, zip_longest
 from operator import attrgetter
 from typing import Dict, Optional, Set, List, Tuple, TypeVar
@@ -330,6 +331,11 @@ def _write_snapshot_ymmsl(
 
     def _cleanup_snapshots(
             self, selected_snapshots: List[SnapshotNode]) -> None:
+        """Remove all snapshots that are older than the selected snapshots.
+
+        Args:
+            selected_snapshots: All snapshot nodes of a workflow snapshot
+        """
         # remove all snapshots older than the selected ones
         removed_snapshots = set()  # type: Set[SnapshotNode]
         for snapshot in selected_snapshots:
@@ -349,8 +355,20 @@ def _cleanup_snapshots(
                 peer_snapshot.consistent_peers[snapshot.instance].remove(
                         snapshot)
 
-    # TODO: add caching decorator or move into an instance variable
+    @lru_cache(maxsize=None)
     def _get_stateful_peers(self, instance: Reference) -> Set[Reference]:
+        """Return the set of stateful peers for the given instance.
+
+        Note: instance is assumed to contain the full index, not just the kernel
+        name.
+
+        Args:
+            instance: Instance to get stateful peers of. See
+                :meth:`_is_stateful`.
+
+        Returns:
+            Set with all stateful peer instances (including their index).
+        """
         peers = set()  # type: Set[Reference]
         kernel = instance.without_trailing_ints()
         index = [int(instance[i]) for i in range(len(kernel), len(instance))]
@@ -376,9 +394,30 @@ def _get_stateful_peers(self, instance: Reference) -> Set[Reference]:
                 peers.add(peer_kernel + index[:-1])
         return peers
 
-    # TODO: add caching decorator or move into an instance variable
+    @lru_cache(maxsize=None)
     def _get_connections(self, instance: Reference, peer: Reference
                          ) -> List[_ConnectionType]:
+        """Get the list of connections between instance and peer.
+
+        Args:
+            instance: Instance reference (including index)
+            peer: Peer reference (including index)
+
+        Returns:
+            A list of tuples describing all conduits between instance and peer:
+                instance_port (Reference): the port of instance that is
+                    connected to
+                peer_port (Reference): the port on the peer instance
+                info (_ConnectionInfo): flag describing the connection. The
+                    instance is sending when
+                    ``info & _ConnectionInfo.SELF_IS_SENDING`` and receiving
+                    otherwise. When the instance port is a vector port and the
+                    peer port is a non-vector port, the flag
+                    ``_ConnectionInfo.SELF_IS_VECTOR`` is set. In the reverse
+                    situation the flag ``_ConnectionInfo.PEER_IS_VECTOR`` is
+                    set. When both ports are vector or non-vector, neither flag
+                    is set.
+        """
         instance_kernel = instance.without_trailing_ints()
         peer_kernel = peer.without_trailing_ints()
 
@@ -412,15 +451,26 @@ def _get_connections(self, instance: Reference, peer: Reference
                         conn_type))
         return connected_ports
 
-    # TODO: add caching decorator or move into an instance variable
+    @lru_cache(maxsize=None)
     def _multiplicity(self, kernel: Reference) -> List[int]:
+        """Return the multiplicity of a kernel
+        """
         for component in self._configuration.model.components:
             if component.name == kernel:
                 return component.multiplicity
         raise KeyError(str(kernel))
 
-    # TODO: add caching decorator or move into an instance variable
+    @lru_cache(maxsize=None)
     def _implementation(self, kernel: Reference) -> Optional[Implementation]:
+        """Return the implementation of a kernel.
+
+        Args:
+            kernel: The kernel to get the implementation for.
+
+        Returns:
+            Implementation for the kernel, or None if not provided in the
+            configuration.
+        """
         implementation = None
         for component in self._configuration.model.components:
             if component.name == kernel:
@@ -429,7 +479,18 @@ def _implementation(self, kernel: Reference) -> Optional[Implementation]:
             return self._configuration.implementations[implementation]
         return None
 
+    @lru_cache(maxsize=None)
     def _is_stateful(self, kernel: Reference) -> bool:
+        """Check if a kernel has a stateful implementation.
+
+        A kernel is considered stateful if:
+        - There is no Implementation given for the kernel
+        - Implementation.stateful = ImplementationState.STATEFUL
+        - Implementation.stateful = ImplementationState.WEAKLY_STATEFUL and the
+            implementation supports checkpointing. In this case we assume to get
+            snapshots from these kernels and we take them into account in the
+            snapshot graph.
+        """
         implementation = self._implementation(kernel)
         if implementation is None:
             return True  # assume stateful

From e244bdf340d2f10d122877669670d733e84d4934 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Fri, 9 Sep 2022 15:42:44 +0200
Subject: [PATCH 040/183] Add logic for storing snapshot ymmsl

---
 .../libmuscle/manager/snapshot_registry.py    | 78 +++++++++++++++++--
 .../manager/test/test_snapshot_registry.py    | 76 +++++++++++++++---
 2 files changed, 140 insertions(+), 14 deletions(-)

diff --git a/libmuscle/python/libmuscle/manager/snapshot_registry.py b/libmuscle/python/libmuscle/manager/snapshot_registry.py
index 49c5351b..9fb53d69 100644
--- a/libmuscle/python/libmuscle/manager/snapshot_registry.py
+++ b/libmuscle/python/libmuscle/manager/snapshot_registry.py
@@ -1,16 +1,21 @@
 from dataclasses import dataclass, field
+from datetime import datetime
 from enum import Flag, auto
 from functools import lru_cache
 from itertools import chain, zip_longest
 from operator import attrgetter
+from pathlib import Path
 from typing import Dict, Optional, Set, List, Tuple, TypeVar
 
-from ymmsl import Reference, Configuration, Identifier, Implementation
-from ymmsl import ImplementationState as IState
+from ymmsl import (
+        Reference, Configuration, Identifier, Implementation, save,
+        PartialConfiguration, ImplementationState as IState)
 
 from libmuscle.snapshot import SnapshotMetadata
 
 
+_MAX_FILE_EXISTS_CHECK = 100
+
 _SnapshotDictType = Dict[Reference, List["SnapshotNode"]]
 _ConnectionType = Tuple[Identifier, Identifier, "_ConnectionInfo"]
 _T = TypeVar("_T")
@@ -162,13 +167,15 @@ class SnapshotRegistry:
     :meth:`register_snapshot`.
     """
 
-    def __init__(self, configuration: Configuration) -> None:
+    def __init__(
+            self, configuration: Configuration, snapshot_folder: Path) -> None:
         """Create a snapshot graph using provided configuration.
 
         Args:
             configuration: ymmsl configuration describing the workflow.
         """
         self._configuration = configuration
+        self._snapshot_folder = snapshot_folder
 
         self._snapshots = {}                # type: _SnapshotDictType
 
@@ -326,8 +333,69 @@ def num_allowed_snapshots(instance: Reference) -> int:
         return selected_snapshots
 
     def _write_snapshot_ymmsl(
-            self, selected_snapshot: List[SnapshotNode]) -> None:
-        ...
+            self, selected_snapshots: List[SnapshotNode]) -> None:
+        """Write the snapshot ymmsl file to the snapshot folder.
+
+        Args:
+            selected_snapshots: All snapshot nodes of the workflow snapshot.
+        """
+        now = datetime.now()
+        config = self._generate_snapshot_config(selected_snapshots, now)
+        time = now.strftime('%Y%m%d_%H%M%S')
+        for i in range(_MAX_FILE_EXISTS_CHECK):
+            if i:
+                snapshot_filename = f'snapshot_{time}_{i}.ymmsl'
+            else:
+                snapshot_filename = f'snapshot_{time}.ymmsl'
+            savepath = self._snapshot_folder / snapshot_filename
+            if not savepath.exists():
+                save(config, savepath)
+                return
+        raise RuntimeError('Could not find an available filename for storing'
+                           f' the next workflow snapshot: {savepath} already'
+                           ' exists.')
+
+    def _generate_snapshot_config(
+                self, selected_snapshots: List[SnapshotNode], now: datetime
+                ) -> PartialConfiguration:
+        """Generate ymmsl configuration for snapshot file
+        """
+        selected_snapshots.sort(key=attrgetter('instance'))
+        resume = {}
+        for node in selected_snapshots:
+            resume[node.instance] = Path(node.snapshot.snapshot_filename)
+        description = self._generate_description(selected_snapshots, now)
+        return PartialConfiguration(resume=resume, description=description)
+
+    def _generate_description(
+            self, selected_snapshots: List[SnapshotNode], now: datetime) -> str:
+        """Generate a human-readable description of the workflow snapshot.
+        """
+        triggers = {}   # type: Dict[str, List[str]]
+        component_info = []
+        max_instance_len = len('Instance ')
+        for node in selected_snapshots:
+            for trigger in node.snapshot.triggers:
+                triggers.setdefault(trigger, []).append(str(node.instance))
+            component_info.append((
+                    str(node.instance),
+                    f'{node.snapshot.timestamp:<11.6g}',
+                    f'{node.snapshot.wallclock_time:<11.6g}'))
+            max_instance_len = max(max_instance_len, len(str(node.instance)))
+        instance_with_padding = 'Instance'.ljust(max_instance_len)
+        component_table = [
+                f'{instance_with_padding} t           wallclock time',
+                f'{"-" * (max_instance_len + 27)}']
+        component_table += [
+                f'{name.ljust(max_instance_len)} {timestamp} {walltime}'
+                for name, timestamp, walltime in component_info]
+        return (f'Workflow snapshot for {self._configuration.model.name}'
+                f' taken on {now.strftime("%Y-%m-%d %H:%M:%S")}.\n'
+                'Snapshot triggers:\n' +
+                '\n'.join(f'- {trigger} ({", ".join(triggers[trigger])})'
+                          for trigger in sorted(triggers)) +
+                '\n\n' +
+                '\n'.join(component_table))
 
     def _cleanup_snapshots(
             self, selected_snapshots: List[SnapshotNode]) -> None:
diff --git a/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py b/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py
index d9d83068..a7f48656 100644
--- a/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py
+++ b/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py
@@ -1,3 +1,5 @@
+from datetime import datetime, timedelta
+from pathlib import Path
 from unittest.mock import MagicMock
 
 import pytest
@@ -7,7 +9,7 @@
         ImplementationState as IState, Reference)
 
 from libmuscle.manager.snapshot_registry import (
-    SnapshotRegistry, calc_consistency, calc_consistency_list, safe_get,
+    SnapshotNode, SnapshotRegistry, calc_consistency, calc_consistency_list, safe_get,
     _ConnectionInfo)
 
 
@@ -97,8 +99,64 @@ def test_calc_consistency_list() -> None:
         assert calc_consistency_list(num_received, num_sent, False)
 
 
+def test_write_ymmsl(tmp_path: Path):
+    snapshot_registry = SnapshotRegistry(
+            Configuration(Model('empty', [])), tmp_path)
+    snapshot_registry._write_snapshot_ymmsl([])
+
+    paths = list(tmp_path.iterdir())
+    assert len(paths) == 1
+    assert paths[0].suffix == ".ymmsl"
+    paths[0].unlink()
+
+    now = datetime.now()
+    for seconds in range(3):
+        time = (now + timedelta(seconds=seconds)).strftime("%Y%m%d_%H%M%S")
+        (tmp_path / f'snapshot_{time}.ymmsl').touch()
+    snapshot_registry._write_snapshot_ymmsl([])
+    paths = list(tmp_path.iterdir())
+    assert len(paths) == 4
+    paths = list(tmp_path.glob('*_1.ymmsl'))
+    assert len(paths) == 1
+
+
+def test_snapshot_config():
+    snapshot_registry = SnapshotRegistry(
+            Configuration(Model('empty', [])), None)
+    micro_metadata = SnapshotMetadata(
+            ['simulation_time >= 24.0', 'wallclocktime >= 10'],
+            10.123456789, 24.3456789, None, {}, False, 'micro_snapshot')
+    macro_metadata = SnapshotMetadata(
+            ['simulation_time >= 12.0', 'wallclocktime >= 10'],
+            10.123456789, 12.3456789, None, {}, False, 'macro_snapshot')
+    snapshots = [
+            SnapshotNode(1, Reference('micro'), micro_metadata, set()),
+            SnapshotNode(1, Reference('macro'), macro_metadata, set())]
+
+    now = datetime.now()
+    config = snapshot_registry._generate_snapshot_config(snapshots, now)
+    assert len(config.resume) == 2
+    assert config.resume[Reference('macro')] == Path('macro_snapshot')
+    assert config.resume[Reference('micro')] == Path('micro_snapshot')
+    # note: no automatic testing for formatting, should verify by eye if this
+    # looks okay..
+    print(config.description)
+
+    long_metadata = SnapshotMetadata(
+            ['simulation_time >= 24.0'], 1.23456789e-10, 1.23456789e10, None,
+            {}, False, '/this/is/a/long/path/to/the/snapshot/file.pack')
+    snapshots.append(SnapshotNode(
+            1, Reference('this.is.a.long.reference[10]'), long_metadata, set()))
+
+    config = snapshot_registry._generate_snapshot_config(snapshots, now)
+    assert len(config.resume) == 3
+    assert config.resume[Reference('this.is.a.long.reference[10]')] == Path(
+            '/this/is/a/long/path/to/the/snapshot/file.pack')
+    print(config.description)
+
+
 def test_stateful_peers(uq: Configuration, micro_is_stateless: bool) -> None:
-    snapshot_registry = SnapshotRegistry(uq)
+    snapshot_registry = SnapshotRegistry(uq, None)
     macro = Reference('macro')
     micro = Reference('micro')
     qmc = Reference('qmc')
@@ -119,7 +177,7 @@ def test_stateful_peers(uq: Configuration, micro_is_stateless: bool) -> None:
 
 
 def test_connections(uq: Configuration) -> None:
-    snapshot_registry = SnapshotRegistry(uq)
+    snapshot_registry = SnapshotRegistry(uq, None)
     macro = Reference('macro')
     micro = Reference('micro')
     qmc = Reference('qmc')
@@ -168,7 +226,7 @@ def test_connections(uq: Configuration) -> None:
 
 
 def test_multiplicity(uq: Configuration) -> None:
-    snapshot_registry = SnapshotRegistry(uq)
+    snapshot_registry = SnapshotRegistry(uq, None)
     assert snapshot_registry._multiplicity(Reference('qmc')) == []
     assert snapshot_registry._multiplicity(Reference('rr')) == []
     assert snapshot_registry._multiplicity(Reference('macro')) == [5]
@@ -176,7 +234,7 @@ def test_multiplicity(uq: Configuration) -> None:
 
 
 def test_implementation(uq: Configuration) -> None:
-    snapshot_registry = SnapshotRegistry(uq)
+    snapshot_registry = SnapshotRegistry(uq, None)
 
     qmc_impl = snapshot_registry._implementation(Reference('qmc'))
     assert qmc_impl.name == 'qmc_impl'
@@ -187,7 +245,7 @@ def test_implementation(uq: Configuration) -> None:
 
 def test_stateful(uq: Configuration, micro_is_stateless: bool) -> None:
     uq.implementations['macro_impl'].stateful = IState.WEAKLY_STATEFUL
-    snapshot_registry = SnapshotRegistry(uq)
+    snapshot_registry = SnapshotRegistry(uq, None)
 
     assert snapshot_registry._is_stateful(Reference('macro'))
     stateful = snapshot_registry._is_stateful(Reference('micro'))
@@ -198,7 +256,7 @@ def test_stateful(uq: Configuration, micro_is_stateless: bool) -> None:
 
 def test_macro_micro_snapshots(
         macro_micro: Configuration, micro_is_stateless: bool) -> None:
-    snapshot_registry = SnapshotRegistry(macro_micro)
+    snapshot_registry = SnapshotRegistry(macro_micro, None)
     # prevent actually writing a ymmsl file, testing that separately
     snapshot_registry._write_snapshot_ymmsl = MagicMock()
     macro = Reference('macro')
@@ -261,7 +319,7 @@ def test_macro_micro_snapshots(
 
 
 def test_uq(uq: Configuration, micro_is_stateless: bool) -> None:
-    snapshot_registry = SnapshotRegistry(uq)
+    snapshot_registry = SnapshotRegistry(uq, None)
     # prevent actually writing a ymmsl file, testing that separately
     snapshot_registry._write_snapshot_ymmsl = MagicMock()
     macro = Reference('macro')
@@ -326,7 +384,7 @@ def test_heuristic_rollbacks() -> None:
 
     comp1, comp2, comp3, comp4 = (Reference(f'comp{i}') for i in range(4))
 
-    snapshot_registry = SnapshotRegistry(config)
+    snapshot_registry = SnapshotRegistry(config, None)
     # prevent actually writing a ymmsl file, testing that separately
     snapshot_registry._write_snapshot_ymmsl = MagicMock()
 

From adfe4760885b3222474791f2193f0db0d5454993 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Tue, 13 Sep 2022 11:48:53 +0200
Subject: [PATCH 041/183] Implement submit_snapshot in MMP server

---
 libmuscle/python/libmuscle/manager/manager.py | 13 +++-
 .../python/libmuscle/manager/mmp_server.py    | 28 ++++++++-
 libmuscle/python/libmuscle/manager/run_dir.py | 18 ++++++
 .../libmuscle/manager/snapshot_registry.py    | 26 +++++---
 .../python/libmuscle/manager/test/conftest.py | 62 ++++++++++++-------
 .../manager/test/test_mmp_request_handler.py  | 30 ++++++++-
 libmuscle/python/libmuscle/mcp/protocol.py    |  1 +
 7 files changed, 139 insertions(+), 39 deletions(-)

diff --git a/libmuscle/python/libmuscle/manager/manager.py b/libmuscle/python/libmuscle/manager/manager.py
index 21f21c60..14c7e15b 100644
--- a/libmuscle/python/libmuscle/manager/manager.py
+++ b/libmuscle/python/libmuscle/manager/manager.py
@@ -11,6 +11,7 @@
 from libmuscle.manager.mmp_server import MMPServer
 from libmuscle.manager.instance_manager import InstanceManager
 from libmuscle.manager.run_dir import RunDir
+from libmuscle.manager.snapshot_registry import SnapshotRegistry
 from libmuscle.manager.topology_store import TopologyStore
 
 
@@ -42,6 +43,15 @@ def __init__(
         self._logger = Logger(log_dir, log_level)
         self._topology_store = TopologyStore(configuration)
         self._instance_registry = InstanceRegistry()
+        if run_dir is not None:
+            snapshot_dir = run_dir.snapshot_dir()
+        else:
+            snapshot_dir = Path.cwd()
+            if self._configuration.checkpoints:
+                _logger.warning('Checkpoints are configured but no run'
+                                ' directory is provided. Snapshots will be'
+                                ' stored in the current working directory.')
+        self._snapshot_registry = SnapshotRegistry(configuration, snapshot_dir)
 
         if self._run_dir:
             save_ymmsl(
@@ -59,7 +69,8 @@ def __init__(
 
         self._server = MMPServer(
                 self._logger, self._configuration,
-                self._instance_registry, self._topology_store)
+                self._instance_registry, self._topology_store,
+                self._snapshot_registry)
 
         if self._instance_manager:
             self._instance_manager.set_manager_location(
diff --git a/libmuscle/python/libmuscle/manager/mmp_server.py b/libmuscle/python/libmuscle/manager/mmp_server.py
index 793bbfc8..6f377362 100644
--- a/libmuscle/python/libmuscle/manager/mmp_server.py
+++ b/libmuscle/python/libmuscle/manager/mmp_server.py
@@ -12,10 +12,12 @@
 from libmuscle.manager.instance_registry import (
         AlreadyRegistered, InstanceRegistry)
 from libmuscle.manager.logger import Logger
+from libmuscle.manager.snapshot_registry import SnapshotRegistry
 from libmuscle.manager.topology_store import TopologyStore
 from libmuscle.mcp.protocol import RequestType, ResponseType
 from libmuscle.mcp.tcp_transport_server import TcpTransportServer
 from libmuscle.mcp.transport_server import RequestHandler
+from libmuscle.snapshot import SnapshotMetadata
 from libmuscle.timestamp import Timestamp
 from libmuscle.util import generate_indices, instance_indices
 
@@ -55,7 +57,8 @@ def __init__(
             logger: Logger,
             configuration: PartialConfiguration,
             instance_registry: InstanceRegistry,
-            topology_store: TopologyStore):
+            topology_store: TopologyStore,
+            snapshot_registry: SnapshotRegistry):
         """Create an MMPRequestHandler.
 
         Args:
@@ -68,6 +71,7 @@ def __init__(
         self._configuration = configuration
         self._instance_registry = instance_registry
         self._topology_store = topology_store
+        self._snapshot_registry = snapshot_registry
         self._reference_time = datetime.now(timezone.utc)
         self._reference_timestamp = self._reference_time.timestamp()
 
@@ -95,6 +99,8 @@ def handle_request(self, request: bytes) -> bytes:
             response = self._submit_log_message(*req_args)
         elif req_type == RequestType.SUBMIT_PROFILE_EVENTS.value:
             response = self._submit_profile_events(*req_args)
+        elif req_type == RequestType.SUBMIT_SNAPSHOT.value:
+            response = self._submit_snapshot(*req_args)
 
         return cast(bytes, msgpack.packb(response, use_bin_type=True))
 
@@ -259,6 +265,20 @@ def _submit_profile_events(self, events: List[List[Any]]) -> Any:
         """
         return [ResponseType.SUCCESS.value]
 
+    def _submit_snapshot(
+            self, instance_id: str, snapshot: Dict[str, Any]) -> Any:
+        """Handle a submit snapshot request.
+
+        Returns:
+            A list containing the following values on success:
+
+            status (ResponseType): SUCCESS
+        """
+        snapshot_obj = SnapshotMetadata(**snapshot)
+        instance = Reference(instance_id)
+        self._snapshot_registry.register_snapshot(instance, snapshot_obj)
+        return [ResponseType.SUCCESS.value]
+
     def _generate_peer_instances(
             self, instance: Reference) -> Generator[Reference, None, None]:
         """Generates the names of all peer instances of an instance.
@@ -320,7 +340,8 @@ def __init__(
             logger: Logger,
             configuration: PartialConfiguration,
             instance_registry: InstanceRegistry,
-            topology_store: TopologyStore
+            topology_store: TopologyStore,
+            snapshot_registry: SnapshotRegistry
             ) -> None:
         """Create an MMPServer.
 
@@ -338,7 +359,8 @@ def __init__(
             topology_store: To get peers and conduits from
         """
         self._handler = MMPRequestHandler(
-                logger, configuration, instance_registry, topology_store)
+                logger, configuration, instance_registry, topology_store,
+                snapshot_registry)
         try:
             self._server = TcpTransportServer(self._handler, 9000)
         except OSError as e:
diff --git a/libmuscle/python/libmuscle/manager/run_dir.py b/libmuscle/python/libmuscle/manager/run_dir.py
index 8bb4b91e..c2a50ed9 100644
--- a/libmuscle/python/libmuscle/manager/run_dir.py
+++ b/libmuscle/python/libmuscle/manager/run_dir.py
@@ -1,4 +1,5 @@
 from pathlib import Path
+from typing import Optional
 
 from ymmsl import Reference
 
@@ -20,6 +21,8 @@ class RunDir:
                 <instance_name[i]>.out
                 <instance_name[i]>.err
                 work_dir/
+                snapshots/
+        snapshots/
     """
     def __init__(self, run_dir: Path) -> None:
         """Create a RunDir managing the given directory.
@@ -57,3 +60,18 @@ def instance_dir(self, name: Reference) -> Path:
         make it.
         """
         return self.path / 'instances' / str(name)
+
+    def snapshot_dir(self, name: Optional[Reference] = None) -> Path:
+        """Return the snapshot directory for the workflow or for an instance.
+
+        Args:
+            name: Name of the instance. May be None to get the workflow snapshot
+                directory.
+
+        Returns:
+            The path to the snapshot directory
+        """
+        if name is None:
+            return self.path / 'snapshots'
+        else:
+            return self.instance_dir(name) / 'snapshots'
diff --git a/libmuscle/python/libmuscle/manager/snapshot_registry.py b/libmuscle/python/libmuscle/manager/snapshot_registry.py
index 9fb53d69..b7f34253 100644
--- a/libmuscle/python/libmuscle/manager/snapshot_registry.py
+++ b/libmuscle/python/libmuscle/manager/snapshot_registry.py
@@ -8,7 +8,7 @@
 from typing import Dict, Optional, Set, List, Tuple, TypeVar
 
 from ymmsl import (
-        Reference, Configuration, Identifier, Implementation, save,
+        Reference, Model, Identifier, Implementation, save,
         PartialConfiguration, ImplementationState as IState)
 
 from libmuscle.snapshot import SnapshotMetadata
@@ -168,20 +168,26 @@ class SnapshotRegistry:
     """
 
     def __init__(
-            self, configuration: Configuration, snapshot_folder: Path) -> None:
+            self, config: PartialConfiguration, snapshot_folder: Path
+            ) -> None:
         """Create a snapshot graph using provided configuration.
 
         Args:
-            configuration: ymmsl configuration describing the workflow.
+            config: ymmsl configuration describing the workflow.
         """
-        self._configuration = configuration
+        if config.model is None or not isinstance(config.model, Model):
+            raise ValueError('The yMMSL experiment description does not'
+                             ' contain a (complete) model section, so there'
+                             ' is nothing to run!')
+        self._configuration = config
+        self._model = config.model
         self._snapshot_folder = snapshot_folder
 
         self._snapshots = {}                # type: _SnapshotDictType
 
         self._instances = set()             # type: Set[Reference]
         self._stateful_instances = set()    # type: Set[Reference]
-        for component in configuration.model.components:
+        for component in config.model.components:
             instances = set(component.instances())
             self._instances.update(instances)
             if self._is_stateful(component.name):
@@ -389,7 +395,7 @@ def _generate_description(
         component_table += [
                 f'{name.ljust(max_instance_len)} {timestamp} {walltime}'
                 for name, timestamp, walltime in component_info]
-        return (f'Workflow snapshot for {self._configuration.model.name}'
+        return (f'Workflow snapshot for {self._model.name}'
                 f' taken on {now.strftime("%Y-%m-%d %H:%M:%S")}.\n'
                 'Snapshot triggers:\n' +
                 '\n'.join(f'- {trigger} ({", ".join(triggers[trigger])})'
@@ -440,7 +446,7 @@ def _get_stateful_peers(self, instance: Reference) -> Set[Reference]:
         peers = set()  # type: Set[Reference]
         kernel = instance.without_trailing_ints()
         index = [int(instance[i]) for i in range(len(kernel), len(instance))]
-        for conduit in self._configuration.model.conduits:
+        for conduit in self._model.conduits:
             if conduit.sending_component() == kernel:
                 peer_kernel = conduit.receiving_component()
             elif conduit.receiving_component() == kernel:
@@ -490,7 +496,7 @@ def _get_connections(self, instance: Reference, peer: Reference
         peer_kernel = peer.without_trailing_ints()
 
         connected_ports = []  # type: List[_ConnectionType]
-        for conduit in self._configuration.model.conduits:
+        for conduit in self._model.conduits:
             if (conduit.sending_component() == instance_kernel and
                     conduit.receiving_component() == peer_kernel):
                 conn_type = _ConnectionInfo.SELF_IS_SENDING
@@ -523,7 +529,7 @@ def _get_connections(self, instance: Reference, peer: Reference
     def _multiplicity(self, kernel: Reference) -> List[int]:
         """Return the multiplicity of a kernel
         """
-        for component in self._configuration.model.components:
+        for component in self._model.components:
             if component.name == kernel:
                 return component.multiplicity
         raise KeyError(str(kernel))
@@ -540,7 +546,7 @@ def _implementation(self, kernel: Reference) -> Optional[Implementation]:
             configuration.
         """
         implementation = None
-        for component in self._configuration.model.components:
+        for component in self._model.components:
             if component.name == kernel:
                 implementation = component.implementation
         if implementation in self._configuration.implementations:
diff --git a/libmuscle/python/libmuscle/manager/test/conftest.py b/libmuscle/python/libmuscle/manager/test/conftest.py
index 433e23b1..e95f290c 100644
--- a/libmuscle/python/libmuscle/manager/test/conftest.py
+++ b/libmuscle/python/libmuscle/manager/test/conftest.py
@@ -1,12 +1,12 @@
 from pathlib import Path
 
 import pytest
-from ymmsl import (Component, Conduit, Configuration, Model, Reference,
-                   PartialConfiguration)
+from ymmsl import Component, Conduit, Configuration, Model, Reference
 
 from libmuscle.manager.instance_registry import InstanceRegistry
 from libmuscle.manager.logger import Logger
 from libmuscle.manager.mmp_server import MMPRequestHandler
+from libmuscle.manager.snapshot_registry import SnapshotRegistry
 from libmuscle.manager.topology_store import TopologyStore
 
 
@@ -19,17 +19,7 @@ def logger(tmpdir):
 
 @pytest.fixture
 def mmp_configuration():
-    return PartialConfiguration()
-
-
-@pytest.fixture
-def instance_registry():
-    return InstanceRegistry()
-
-
-@pytest.fixture
-def topology_store() -> TopologyStore:
-    config = Configuration(
+    return Configuration(
             Model(
                 'test_model',
                 [
@@ -41,14 +31,29 @@ def topology_store() -> TopologyStore:
                     Conduit('micro.out', 'macro.in')
                 ]))
 
-    return TopologyStore(config)
+
+@pytest.fixture
+def instance_registry():
+    return InstanceRegistry()
+
+
+@pytest.fixture
+def topology_store(mmp_configuration) -> TopologyStore:
+    return TopologyStore(mmp_configuration)
+
+
+@pytest.fixture
+def snapshot_registry(mmp_configuration) -> SnapshotRegistry:
+    return SnapshotRegistry(mmp_configuration, None)
 
 
 @pytest.fixture
 def mmp_request_handler(
-        logger, mmp_configuration, instance_registry, topology_store):
+        logger, mmp_configuration, instance_registry, topology_store,
+        snapshot_registry):
     return MMPRequestHandler(
-            logger, mmp_configuration, instance_registry, topology_store)
+            logger, mmp_configuration, instance_registry, topology_store,
+            snapshot_registry)
 
 
 @pytest.fixture
@@ -64,14 +69,16 @@ def loaded_instance_registry(instance_registry):
 
 @pytest.fixture
 def registered_mmp_request_handler(
-        logger, mmp_configuration, loaded_instance_registry, topology_store):
+        logger, mmp_configuration, loaded_instance_registry, topology_store,
+        snapshot_registry):
     return MMPRequestHandler(
-            logger, mmp_configuration, loaded_instance_registry, topology_store)
+            logger, mmp_configuration, loaded_instance_registry, topology_store,
+            snapshot_registry)
 
 
 @pytest.fixture
-def topology_store2() -> TopologyStore:
-    config = Configuration(
+def mmp_configuration2():
+    return Configuration(
             Model(
                 'test_model',
                 [
@@ -86,7 +93,15 @@ def topology_store2() -> TopologyStore:
                     Conduit('meso.out', 'macro.in')
                 ]))
 
-    return TopologyStore(config)
+
+@pytest.fixture
+def topology_store2(mmp_configuration2) -> TopologyStore:
+    return TopologyStore(mmp_configuration2)
+
+
+@pytest.fixture
+def snapshot_registry2(mmp_configuration2) -> SnapshotRegistry:
+    return SnapshotRegistry(mmp_configuration2, None)
 
 
 @pytest.fixture
@@ -110,7 +125,8 @@ def loaded_instance_registry2():
 
 @pytest.fixture
 def registered_mmp_request_handler2(
-        logger, mmp_configuration, loaded_instance_registry2, topology_store2):
+        logger, mmp_configuration, loaded_instance_registry2, topology_store2,
+        snapshot_registry2):
     return MMPRequestHandler(
             logger, mmp_configuration,
-            loaded_instance_registry2, topology_store2)
+            loaded_instance_registry2, topology_store2, snapshot_registry2)
diff --git a/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py b/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py
index 0d91c650..ac80dca2 100644
--- a/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py
+++ b/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py
@@ -1,5 +1,8 @@
+import dataclasses
 from datetime import datetime, timezone
 from pathlib import Path
+from unittest.mock import MagicMock
+
 import msgpack
 from ymmsl import (
         Operator, Reference, Checkpoints, CheckpointRangeRule, CheckpointAtRule)
@@ -7,12 +10,14 @@
 from libmuscle.logging import LogLevel
 from libmuscle.manager.mmp_server import MMPRequestHandler
 from libmuscle.mcp.protocol import RequestType, ResponseType
+from libmuscle.snapshot import SnapshotMetadata
 
 
 def test_create_servicer(logger, mmp_configuration, instance_registry,
-                         topology_store):
+                         topology_store, snapshot_registry):
     MMPRequestHandler(
-            logger, mmp_configuration, instance_registry, topology_store)
+            logger, mmp_configuration, instance_registry, topology_store,
+            snapshot_registry)
 
 
 def test_log_message(mmp_request_handler, caplog):
@@ -267,3 +272,24 @@ def test_request_peers_unknown(registered_mmp_request_handler2):
     assert status == ResponseType.ERROR.value
     assert error_msg is not None
     assert 'does_not_exist' in error_msg
+
+
+def test_submit_snapshot(registered_mmp_request_handler):
+    register_snapshot = MagicMock()
+    registered_mmp_request_handler._snapshot_registry.register_snapshot = \
+        register_snapshot
+
+    instance_id = 'micro[1][2]'
+    snapshot = SnapshotMetadata(
+            ['1', '2'], 1.234, 2.345, 3.456,
+            {'in': [1], 'out': [0]}, True, 'fname')
+    snapshot_dict = dataclasses.asdict(snapshot)
+
+    request = [RequestType.SUBMIT_SNAPSHOT.value, instance_id, snapshot_dict]
+    encoded_request = msgpack.packb(request, use_bin_type=True)
+
+    result = registered_mmp_request_handler.handle_request(encoded_request)
+    decoded_result = msgpack.unpackb(result, raw=False)
+
+    assert decoded_result[0] == ResponseType.SUCCESS.value
+    register_snapshot.assert_called_once_with(Reference(instance_id), snapshot)
diff --git a/libmuscle/python/libmuscle/mcp/protocol.py b/libmuscle/python/libmuscle/mcp/protocol.py
index 1e79a11d..06d1c0da 100644
--- a/libmuscle/python/libmuscle/mcp/protocol.py
+++ b/libmuscle/python/libmuscle/mcp/protocol.py
@@ -20,6 +20,7 @@ class RequestType(Enum):
     GET_SETTINGS = 4
     SUBMIT_LOG_MESSAGE = 5
     SUBMIT_PROFILE_EVENTS = 6
+    SUBMIT_SNAPSHOT = 7
 
     # MUSCLE Peer Protocol
     GET_NEXT_MESSAGE = 21

From d4c19ddabc14ce09c5f9dea3a372a14f14b84019 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Tue, 13 Sep 2022 11:57:03 +0200
Subject: [PATCH 042/183] Implement submit_snapshot in MMP client

---
 libmuscle/python/libmuscle/mmp_client.py       | 18 +++++++++++++++---
 libmuscle/python/libmuscle/snapshot_manager.py |  2 +-
 .../libmuscle/test/test_snapshot_manager.py    |  6 ++++--
 3 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/libmuscle/python/libmuscle/mmp_client.py b/libmuscle/python/libmuscle/mmp_client.py
index 6376aa20..6a3fe729 100644
--- a/libmuscle/python/libmuscle/mmp_client.py
+++ b/libmuscle/python/libmuscle/mmp_client.py
@@ -1,3 +1,4 @@
+import dataclasses
 from datetime import datetime, timezone
 from pathlib import Path
 from random import uniform
@@ -135,9 +136,20 @@ def submit_profile_events(self, events: Iterable[ProfileEvent]) -> None:
                 [encode_profile_event(e) for e in events]]
         self._call_manager(request)
 
-    def submit_snapshot_metadata(self, snapshot_metadata: SnapshotMetadata
-                                 ) -> None:
-        ...  # TODO
+    def submit_snapshot_metadata(
+                self, name: Reference, snapshot_metadata: SnapshotMetadata
+                ) -> None:
+        """Send snapshot metadata to the manager.
+
+        Args:
+            name: Name of the instance in the simulation.
+            snapshot_metadata: Snapshot metadata to supply to the manager.
+        """
+        request = [
+                RequestType.SUBMIT_SNAPSHOT.value,
+                str(name),
+                dataclasses.asdict(snapshot_metadata)]
+        self._call_manager(request)
 
     def get_settings(self) -> Settings:
         """Get the central settings from the manager.
diff --git a/libmuscle/python/libmuscle/snapshot_manager.py b/libmuscle/python/libmuscle/snapshot_manager.py
index cd6f9959..10f2c9fc 100644
--- a/libmuscle/python/libmuscle/snapshot_manager.py
+++ b/libmuscle/python/libmuscle/snapshot_manager.py
@@ -154,7 +154,7 @@ def __save_snapshot(self, msg: Message, final: bool) -> None:
 
         path = self.__store_snapshot(snapshot)
         metadata = SnapshotMetadata.from_snapshot(snapshot, str(path))
-        self._manager.submit_snapshot_metadata(metadata)
+        self._manager.submit_snapshot_metadata(self._instance_id, metadata)
 
         if self._trigger is not None:
             self._trigger.update_checkpoints(
diff --git a/libmuscle/python/libmuscle/test/test_snapshot_manager.py b/libmuscle/python/libmuscle/test/test_snapshot_manager.py
index d7d386c9..f1c18ec8 100644
--- a/libmuscle/python/libmuscle/test/test_snapshot_manager.py
+++ b/libmuscle/python/libmuscle/test/test_snapshot_manager.py
@@ -56,7 +56,8 @@ def test_save_load_checkpoint(tmp_path: Path) -> None:
 
     communicator.get_message_counts.assert_called_with()
     manager.submit_snapshot_metadata.assert_called()
-    metadata = manager.submit_snapshot_metadata.call_args[0][0]
+    instance, metadata = manager.submit_snapshot_metadata.call_args[0]
+    assert instance == instance_id
     assert isinstance(metadata, SnapshotMetadata)
     assert metadata.triggers
     assert metadata.wallclock_time > 0.0
@@ -86,7 +87,8 @@ def test_save_load_checkpoint(tmp_path: Path) -> None:
     assert snapshot_manager2.should_save_final_snapshot(0.6)
     snapshot_manager2.save_final_snapshot(Message(0.6, None, 'test data2'))
 
-    metadata = manager.submit_snapshot_metadata.call_args[0][0]
+    instance, metadata = manager.submit_snapshot_metadata.call_args[0]
+    assert instance == instance_id
     assert isinstance(metadata, SnapshotMetadata)
     assert metadata.triggers
     assert metadata.wallclock_time > 0.0

From 4fd2b530a4698aba9d5d8552b4871b8cf8201bb4 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Tue, 13 Sep 2022 15:36:45 +0200
Subject: [PATCH 043/183] Use builtin itertools.product for generate_indices

---
 libmuscle/python/libmuscle/util.py | 29 +++--------------------------
 1 file changed, 3 insertions(+), 26 deletions(-)

diff --git a/libmuscle/python/libmuscle/util.py b/libmuscle/python/libmuscle/util.py
index 66217c21..f34d8cd9 100644
--- a/libmuscle/python/libmuscle/util.py
+++ b/libmuscle/python/libmuscle/util.py
@@ -1,3 +1,4 @@
+import itertools
 from pathlib import Path
 import sys
 from typing import Generator, List, Optional, cast
@@ -47,32 +48,8 @@ def generate_indices(dims: List[int]) -> Generator[List[int], None, None]:
     Yields:
         Lists of indices, one for each point in the block.
     """
-    index = [0] * len(dims)
-    done = False
-    while not done:
-        yield index
-        done = increment_index(index, dims)
-
-
-def increment_index(index: List[int], dims: List[int]) -> bool:
-    """Increments an index.
-
-    Args:
-        index: The index to be incremented.
-        dims: The dimensions of the block this index is in.
-
-    Returns:
-        True iff the index overflowed and is now all zeros again.
-    """
-    cur = len(index) - 1
-    index[cur] += 1
-    while index[cur] == dims[cur]:
-        index[cur] = 0
-        if cur == 0:
-            return True
-        cur -= 1
-        index[cur] += 1
-    return False
+    for index in itertools.product(*map(range, dims)):
+        yield list(index)
 
 
 def extract_log_file_location(filename: str) -> Optional[Path]:

From 401a27594d570ad63bbb86ae0c26d137841e014a Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Tue, 13 Sep 2022 15:38:36 +0200
Subject: [PATCH 044/183] Refactor generate_peer_instances

Now reused across mmp_server and snapshot_registry
---
 libmuscle/python/libmuscle/manager/manager.py |  3 +-
 .../python/libmuscle/manager/mmp_server.py    | 31 ++-----------
 .../libmuscle/manager/snapshot_registry.py    | 43 ++++---------------
 .../python/libmuscle/manager/test/conftest.py |  8 ++--
 .../manager/test/test_snapshot_registry.py    | 32 ++++++--------
 .../libmuscle/manager/topology_store.py       | 28 ++++++++++++
 6 files changed, 59 insertions(+), 86 deletions(-)

diff --git a/libmuscle/python/libmuscle/manager/manager.py b/libmuscle/python/libmuscle/manager/manager.py
index 14c7e15b..c28e65fb 100644
--- a/libmuscle/python/libmuscle/manager/manager.py
+++ b/libmuscle/python/libmuscle/manager/manager.py
@@ -51,7 +51,8 @@ def __init__(
                 _logger.warning('Checkpoints are configured but no run'
                                 ' directory is provided. Snapshots will be'
                                 ' stored in the current working directory.')
-        self._snapshot_registry = SnapshotRegistry(configuration, snapshot_dir)
+        self._snapshot_registry = SnapshotRegistry(
+                configuration, snapshot_dir, self._topology_store)
 
         if self._run_dir:
             save_ymmsl(
diff --git a/libmuscle/python/libmuscle/manager/mmp_server.py b/libmuscle/python/libmuscle/manager/mmp_server.py
index 6f377362..f5b8b692 100644
--- a/libmuscle/python/libmuscle/manager/mmp_server.py
+++ b/libmuscle/python/libmuscle/manager/mmp_server.py
@@ -1,7 +1,7 @@
 from datetime import datetime, timezone
 import errno
 import logging
-from typing import Any, Dict, Optional, Tuple, cast, Generator, List
+from typing import Any, Dict, Optional, Tuple, cast, List
 
 import msgpack
 from ymmsl import (
@@ -19,7 +19,6 @@
 from libmuscle.mcp.transport_server import RequestHandler
 from libmuscle.snapshot import SnapshotMetadata
 from libmuscle.timestamp import Timestamp
-from libmuscle.util import generate_indices, instance_indices
 
 
 _logger = logging.getLogger(__name__)
@@ -182,9 +181,10 @@ def _get_peers(self, instance_id: str) -> Any:
 
         # generate instances
         try:
+            peers = self._topology_store.get_peer_instances(instance)
             instance_locations = {
                     str(peer): self._instance_registry.get_locations(peer)
-                    for peer in self._generate_peer_instances(instance)}
+                    for peer in peers}
         except KeyError as e:
             return [
                     ResponseType.PENDING.value,
@@ -279,31 +279,6 @@ def _submit_snapshot(
         self._snapshot_registry.register_snapshot(instance, snapshot_obj)
         return [ResponseType.SUCCESS.value]
 
-    def _generate_peer_instances(
-            self, instance: Reference) -> Generator[Reference, None, None]:
-        """Generates the names of all peer instances of an instance.
-
-        Args:
-            instance: The instance whose peers to generate.
-
-        Yields:
-            All peer instance identifiers.
-        """
-        component = instance.without_trailing_ints()
-        indices = instance_indices(instance)
-        dims = self._topology_store.kernel_dimensions[component]
-        all_peer_dims = self._topology_store.get_peer_dimensions(component)
-        for peer, peer_dims in all_peer_dims.items():
-            base = peer
-            for i in range(min(len(dims), len(peer_dims))):
-                base += indices[i]
-
-            if dims >= peer_dims:
-                yield base
-            else:
-                for peer_indices in generate_indices(peer_dims[len(dims):]):
-                    yield base + peer_indices
-
     def _get_checkpoint_info(
                 self,
                 instance: Reference
diff --git a/libmuscle/python/libmuscle/manager/snapshot_registry.py b/libmuscle/python/libmuscle/manager/snapshot_registry.py
index b7f34253..6e7f1199 100644
--- a/libmuscle/python/libmuscle/manager/snapshot_registry.py
+++ b/libmuscle/python/libmuscle/manager/snapshot_registry.py
@@ -6,6 +6,7 @@
 from operator import attrgetter
 from pathlib import Path
 from typing import Dict, Optional, Set, List, Tuple, TypeVar
+from libmuscle.manager.topology_store import TopologyStore
 
 from ymmsl import (
         Reference, Model, Identifier, Implementation, save,
@@ -168,8 +169,8 @@ class SnapshotRegistry:
     """
 
     def __init__(
-            self, config: PartialConfiguration, snapshot_folder: Path
-            ) -> None:
+            self, config: PartialConfiguration, snapshot_folder: Path,
+            topology_store: TopologyStore) -> None:
         """Create a snapshot graph using provided configuration.
 
         Args:
@@ -182,6 +183,7 @@ def __init__(
         self._configuration = config
         self._model = config.model
         self._snapshot_folder = snapshot_folder
+        self._topology_store = topology_store
 
         self._snapshots = {}                # type: _SnapshotDictType
 
@@ -443,30 +445,10 @@ def _get_stateful_peers(self, instance: Reference) -> Set[Reference]:
         Returns:
             Set with all stateful peer instances (including their index).
         """
-        peers = set()  # type: Set[Reference]
-        kernel = instance.without_trailing_ints()
-        index = [int(instance[i]) for i in range(len(kernel), len(instance))]
-        for conduit in self._model.conduits:
-            if conduit.sending_component() == kernel:
-                peer_kernel = conduit.receiving_component()
-            elif conduit.receiving_component() == kernel:
-                peer_kernel = conduit.sending_component()
-            else:
-                continue
-            if not self._is_stateful(peer_kernel):
-                continue
-            if len(index) == len(self._multiplicity(peer_kernel)):
-                # we must be sending to the peer with the same index as us
-                peers.add(peer_kernel + index)
-            elif len(index) + 1 == len(self._multiplicity(peer_kernel)):
-                # we are sending on a vector port, peer is receiving non-vector
-                # generate all peer indices
-                for i in range(self._multiplicity(peer_kernel)[-1]):
-                    peers.add(peer_kernel + index + i)
-            elif len(index) - 1 == len(self._multiplicity(peer_kernel)):
-                # we are sending to a vector port, strip last of our indices
-                peers.add(peer_kernel + index[:-1])
-        return peers
+        return set(
+                peer
+                for peer in self._topology_store.get_peer_instances(instance)
+                if self._is_stateful(peer.without_trailing_ints()))
 
     @lru_cache(maxsize=None)
     def _get_connections(self, instance: Reference, peer: Reference
@@ -525,15 +507,6 @@ def _get_connections(self, instance: Reference, peer: Reference
                         conn_type))
         return connected_ports
 
-    @lru_cache(maxsize=None)
-    def _multiplicity(self, kernel: Reference) -> List[int]:
-        """Return the multiplicity of a kernel
-        """
-        for component in self._model.components:
-            if component.name == kernel:
-                return component.multiplicity
-        raise KeyError(str(kernel))
-
     @lru_cache(maxsize=None)
     def _implementation(self, kernel: Reference) -> Optional[Implementation]:
         """Return the implementation of a kernel.
diff --git a/libmuscle/python/libmuscle/manager/test/conftest.py b/libmuscle/python/libmuscle/manager/test/conftest.py
index e95f290c..24772bda 100644
--- a/libmuscle/python/libmuscle/manager/test/conftest.py
+++ b/libmuscle/python/libmuscle/manager/test/conftest.py
@@ -43,8 +43,8 @@ def topology_store(mmp_configuration) -> TopologyStore:
 
 
 @pytest.fixture
-def snapshot_registry(mmp_configuration) -> SnapshotRegistry:
-    return SnapshotRegistry(mmp_configuration, None)
+def snapshot_registry(mmp_configuration, topology_store) -> SnapshotRegistry:
+    return SnapshotRegistry(mmp_configuration, None, topology_store)
 
 
 @pytest.fixture
@@ -100,8 +100,8 @@ def topology_store2(mmp_configuration2) -> TopologyStore:
 
 
 @pytest.fixture
-def snapshot_registry2(mmp_configuration2) -> SnapshotRegistry:
-    return SnapshotRegistry(mmp_configuration2, None)
+def snapshot_registry2(mmp_configuration2, topology_store) -> SnapshotRegistry:
+    return SnapshotRegistry(mmp_configuration2, None, topology_store)
 
 
 @pytest.fixture
diff --git a/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py b/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py
index a7f48656..cc713c6a 100644
--- a/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py
+++ b/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py
@@ -3,7 +3,6 @@
 from unittest.mock import MagicMock
 
 import pytest
-from libmuscle.snapshot import SnapshotMetadata
 from ymmsl import (
         Configuration, Model, Component, Conduit, Implementation,
         ImplementationState as IState, Reference)
@@ -11,6 +10,8 @@
 from libmuscle.manager.snapshot_registry import (
     SnapshotNode, SnapshotRegistry, calc_consistency, calc_consistency_list, safe_get,
     _ConnectionInfo)
+from libmuscle.manager.topology_store import TopologyStore
+from libmuscle.snapshot import SnapshotMetadata
 
 
 def make_snapshot(**msg_counts) -> SnapshotMetadata:
@@ -100,8 +101,9 @@ def test_calc_consistency_list() -> None:
 
 
 def test_write_ymmsl(tmp_path: Path):
+    configuration = Configuration(Model('empty', []))
     snapshot_registry = SnapshotRegistry(
-            Configuration(Model('empty', [])), tmp_path)
+            configuration, tmp_path, TopologyStore(configuration))
     snapshot_registry._write_snapshot_ymmsl([])
 
     paths = list(tmp_path.iterdir())
@@ -121,8 +123,9 @@ def test_write_ymmsl(tmp_path: Path):
 
 
 def test_snapshot_config():
+    configuration = Configuration(Model('empty', []))
     snapshot_registry = SnapshotRegistry(
-            Configuration(Model('empty', [])), None)
+            configuration, None, TopologyStore(configuration))
     micro_metadata = SnapshotMetadata(
             ['simulation_time >= 24.0', 'wallclocktime >= 10'],
             10.123456789, 24.3456789, None, {}, False, 'micro_snapshot')
@@ -156,7 +159,7 @@ def test_snapshot_config():
 
 
 def test_stateful_peers(uq: Configuration, micro_is_stateless: bool) -> None:
-    snapshot_registry = SnapshotRegistry(uq, None)
+    snapshot_registry = SnapshotRegistry(uq, None, TopologyStore(uq))
     macro = Reference('macro')
     micro = Reference('micro')
     qmc = Reference('qmc')
@@ -177,7 +180,7 @@ def test_stateful_peers(uq: Configuration, micro_is_stateless: bool) -> None:
 
 
 def test_connections(uq: Configuration) -> None:
-    snapshot_registry = SnapshotRegistry(uq, None)
+    snapshot_registry = SnapshotRegistry(uq, None, TopologyStore(uq))
     macro = Reference('macro')
     micro = Reference('micro')
     qmc = Reference('qmc')
@@ -225,16 +228,8 @@ def test_connections(uq: Configuration) -> None:
         assert not (info & _ConnectionInfo.PEER_IS_VECTOR)
 
 
-def test_multiplicity(uq: Configuration) -> None:
-    snapshot_registry = SnapshotRegistry(uq, None)
-    assert snapshot_registry._multiplicity(Reference('qmc')) == []
-    assert snapshot_registry._multiplicity(Reference('rr')) == []
-    assert snapshot_registry._multiplicity(Reference('macro')) == [5]
-    assert snapshot_registry._multiplicity(Reference('micro')) == [5]
-
-
 def test_implementation(uq: Configuration) -> None:
-    snapshot_registry = SnapshotRegistry(uq, None)
+    snapshot_registry = SnapshotRegistry(uq, None, TopologyStore(uq))
 
     qmc_impl = snapshot_registry._implementation(Reference('qmc'))
     assert qmc_impl.name == 'qmc_impl'
@@ -245,7 +240,7 @@ def test_implementation(uq: Configuration) -> None:
 
 def test_stateful(uq: Configuration, micro_is_stateless: bool) -> None:
     uq.implementations['macro_impl'].stateful = IState.WEAKLY_STATEFUL
-    snapshot_registry = SnapshotRegistry(uq, None)
+    snapshot_registry = SnapshotRegistry(uq, None, TopologyStore(uq))
 
     assert snapshot_registry._is_stateful(Reference('macro'))
     stateful = snapshot_registry._is_stateful(Reference('micro'))
@@ -256,7 +251,8 @@ def test_stateful(uq: Configuration, micro_is_stateless: bool) -> None:
 
 def test_macro_micro_snapshots(
         macro_micro: Configuration, micro_is_stateless: bool) -> None:
-    snapshot_registry = SnapshotRegistry(macro_micro, None)
+    snapshot_registry = SnapshotRegistry(
+            macro_micro, None, TopologyStore(macro_micro))
     # prevent actually writing a ymmsl file, testing that separately
     snapshot_registry._write_snapshot_ymmsl = MagicMock()
     macro = Reference('macro')
@@ -319,7 +315,7 @@ def test_macro_micro_snapshots(
 
 
 def test_uq(uq: Configuration, micro_is_stateless: bool) -> None:
-    snapshot_registry = SnapshotRegistry(uq, None)
+    snapshot_registry = SnapshotRegistry(uq, None, TopologyStore(uq))
     # prevent actually writing a ymmsl file, testing that separately
     snapshot_registry._write_snapshot_ymmsl = MagicMock()
     macro = Reference('macro')
@@ -384,7 +380,7 @@ def test_heuristic_rollbacks() -> None:
 
     comp1, comp2, comp3, comp4 = (Reference(f'comp{i}') for i in range(4))
 
-    snapshot_registry = SnapshotRegistry(config, None)
+    snapshot_registry = SnapshotRegistry(config, None, TopologyStore(config))
     # prevent actually writing a ymmsl file, testing that separately
     snapshot_registry._write_snapshot_ymmsl = MagicMock()
 
diff --git a/libmuscle/python/libmuscle/manager/topology_store.py b/libmuscle/python/libmuscle/manager/topology_store.py
index c8e3f864..f3dd1fd3 100644
--- a/libmuscle/python/libmuscle/manager/topology_store.py
+++ b/libmuscle/python/libmuscle/manager/topology_store.py
@@ -1,4 +1,5 @@
 from typing import Dict, List
+from libmuscle.util import generate_indices, instance_indices
 
 from ymmsl import Conduit, PartialConfiguration, Model, Reference
 
@@ -77,3 +78,30 @@ def get_peer_dimensions(self, kernel_name: Reference
                 snd = conduit.sending_component()
                 ret[snd] = self.kernel_dimensions[snd]
         return ret
+
+    def get_peer_instances(self, instance: Reference) -> List[Reference]:
+        """Generates the names of all peer instances of an instance.
+
+        Args:
+            instance: The instance whose peers to generate.
+
+        Returns:
+            All peer instance identifiers.
+        """
+        component = instance.without_trailing_ints()
+        indices = instance_indices(instance)
+        dims = self.kernel_dimensions[component]
+        all_peer_dims = self.get_peer_dimensions(component)
+
+        peers = []
+        for peer, peer_dims in all_peer_dims.items():
+            base = peer
+            for i in range(min(len(dims), len(peer_dims))):
+                base += indices[i]
+
+            if dims >= peer_dims:
+                peers.append(base)
+            else:
+                for peer_indices in generate_indices(peer_dims[len(dims):]):
+                    peers.append(base + peer_indices)
+        return peers

From e0d1c4a1edd90395958fd44c6d57384c83f1018f Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Tue, 13 Sep 2022 16:19:06 +0200
Subject: [PATCH 045/183] Placeholder: snapshot directory setting

---
 libmuscle/python/libmuscle/manager/manager.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/libmuscle/python/libmuscle/manager/manager.py b/libmuscle/python/libmuscle/manager/manager.py
index c28e65fb..d25d1977 100644
--- a/libmuscle/python/libmuscle/manager/manager.py
+++ b/libmuscle/python/libmuscle/manager/manager.py
@@ -59,6 +59,11 @@ def __init__(
                     self._configuration,
                     self._run_dir.path / 'configuration.ymmsl')
 
+        # TODO: decide if this should be a setting or part of checkpoint_info
+        # TODO: separate folder per intance
+        self._configuration.settings.setdefault(
+                'muscle_snapshot_directory', str(snapshot_dir))
+
         self._instance_manager = None    # type: Optional[InstanceManager]
         try:
             configuration = self._configuration.as_configuration()

From 0475d852dfe1e88af34a980a9adeceea0e4a473e Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Wed, 14 Sep 2022 12:50:55 +0200
Subject: [PATCH 046/183] Integration test for macro/micro snapshot & resume

---
 integration_test/test_snapshot_macro_micro.py | 163 ++++++++++++++++++
 .../python/libmuscle/checkpoint_triggers.py   |   1 +
 libmuscle/python/libmuscle/instance.py        |   6 +-
 libmuscle/python/libmuscle/manager/run_dir.py |   6 +-
 4 files changed, 172 insertions(+), 4 deletions(-)
 create mode 100644 integration_test/test_snapshot_macro_micro.py

diff --git a/integration_test/test_snapshot_macro_micro.py b/integration_test/test_snapshot_macro_micro.py
new file mode 100644
index 00000000..bbfd42db
--- /dev/null
+++ b/integration_test/test_snapshot_macro_micro.py
@@ -0,0 +1,163 @@
+import sys
+import pytest
+from ymmsl import Operator, load
+
+from libmuscle import Instance, Message
+from libmuscle.manager.manager import Manager
+from libmuscle.manager.run_dir import RunDir
+
+
+def macro():
+    instance = Instance({
+            Operator.O_I: ['o_i'],
+            Operator.S: ['s']})
+
+    while instance.reuse_instance():
+        t_cur = instance.get_setting('t0', 'float')
+        dt = instance.get_setting('dt', 'float')
+        t_max = instance.get_setting('t_max', 'float')
+
+        if instance.resuming():
+            msg = instance.load_snapshot()
+            t_cur = msg.timestamp
+            assert msg.next_timestamp == pytest.approx(t_cur + dt)
+            i = msg.data
+            assert i >= 0
+        else:
+            i = 0
+
+        while t_cur + dt <= t_max:
+            t_next = t_cur + dt
+
+            if instance.should_save_snapshot(t_cur, t_next):
+                instance.save_snapshot(Message(t_cur, t_next, i))
+
+            t_next = None if t_next + dt > t_max else t_next
+            instance.send('o_i', Message(t_cur, t_next, i))
+
+            msg = instance.receive('s')
+            assert msg.data == i
+
+            i += 1
+            t_cur += dt
+
+        if instance.should_save_final_snapshot(t_cur):
+            instance.save_final_snapshot(Message(t_cur, None, i))
+
+
+def micro():
+    instance = Instance({
+            Operator.F_INIT: ['f_i'],
+            Operator.O_F: ['o_f']})
+
+    while instance.reuse_instance():
+        dt = instance.get_setting('dt', 'float')
+        t_max = instance.get_setting('t_max', 'float')
+
+        if instance.resuming():
+            msg = instance.load_snapshot()
+            t_cur = msg.timestamp
+            i, t_stop = msg.data
+        else:
+            msg = instance.receive('f_i')
+            t_cur = msg.timestamp
+            i = msg.data
+            t_stop = t_cur + t_max
+
+        while t_cur < t_stop:
+            t_next = t_cur + dt
+
+            if instance.should_save_snapshot(t_cur, t_next):
+                instance.save_snapshot(Message(t_cur, t_next, [i, t_stop]))
+
+            t_cur += dt
+
+        if instance.should_save_final_snapshot(t_cur):
+            instance.save_final_snapshot(Message(t_cur, None, [i, t_stop]))
+
+        instance.send('o_f', Message(t_cur, None, i))
+
+
+def test_snapshot_macro_micro(tmp_path):
+    ymmsl_text = f"""ymmsl_version: v0.1
+model:
+  name: test_snapshot
+  components:
+    macro: macro_implementation
+    micro: micro_implementation
+  conduits:
+    macro.o_i: micro.f_i
+    micro.o_f: macro.s
+settings:
+  macro.t0: 0.12
+  macro.dt: 0.17
+  macro.t_max: 1.9
+  micro.dt: 0.009
+  micro.t_max: 0.1
+  muscle_remote_log_level: DEBUG
+implementations:
+  macro_implementation:
+    executable: {sys.executable}
+    args:
+    - {__file__}
+    - macro
+    supports_checkpoint: true
+  micro_implementation:
+    executable: {sys.executable}
+    args:
+    - {__file__}
+    - micro
+    supports_checkpoint: true
+resources:
+  macro:
+    threads: 1
+  micro:
+    threads: 1
+checkpoints:
+  simulation_time:
+  - every: 0.4"""
+    ymmsl_doc = load(ymmsl_text)
+
+    run_dir1 = RunDir(tmp_path / 'run1')
+    manager = Manager(ymmsl_doc, run_dir1)
+    manager.start_instances()
+    assert manager.wait()
+
+    # Note: sorted only works because we have fewer than 10 snapshots, otherwise
+    # _10 would be sorted right after _1
+    macro_snapshots = sorted(run_dir1.snapshot_dir().glob('macro*'))
+    assert len(macro_snapshots) == 6  # 0, 0.4, 0.8, 1.2, 1.6, final
+    micro_snapshots = sorted(run_dir1.snapshot_dir().glob('micro*'))
+    assert len(micro_snapshots) == 6  # 0, 0.4, 0.8, 1.2, 1.6, final
+    snapshots_ymmsl = sorted(run_dir1.snapshot_dir().glob('snapshot_*.ymmsl'))
+    snapshot_docs = list(map(load, snapshots_ymmsl))
+    assert snapshot_docs[0].resume['macro'] == macro_snapshots[0]
+    assert snapshot_docs[0].resume['micro'] == micro_snapshots[0]
+    assert snapshot_docs[1].resume['macro'] == macro_snapshots[1]
+    assert snapshot_docs[1].resume['micro'] == micro_snapshots[0]
+    for i in range(2, 7):
+        assert snapshot_docs[i].resume['macro'] == macro_snapshots[i - 1]
+        assert snapshot_docs[i].resume['micro'] == micro_snapshots[i - 1]
+
+    ymmsl_doc.update(snapshot_docs[4])
+    del ymmsl_doc.settings['muscle_snapshot_directory']
+    run_dir2 = RunDir(tmp_path / 'run2')
+    manager = Manager(ymmsl_doc, run_dir2)
+    manager.start_instances()
+    assert manager.wait()
+
+    macro_snapshots = sorted(run_dir2.snapshot_dir().glob('macro*'))
+    assert len(macro_snapshots) == 2  # 1.6, final
+    micro_snapshots = sorted(run_dir2.snapshot_dir().glob('micro*'))
+    assert len(micro_snapshots) == 3  # 1.2, 1.6, final
+    snapshots_ymmsl = sorted(run_dir2.snapshot_dir().glob('snapshot_*.ymmsl'))
+    assert len(snapshots_ymmsl) == 2
+
+
+if __name__ == "__main__":
+    if 'macro' in sys.argv:
+        macro()
+    elif 'micro' in sys.argv:
+        micro()
+    else:
+        raise RuntimeError('Specify macro or micro on the command line')
diff --git a/libmuscle/python/libmuscle/checkpoint_triggers.py b/libmuscle/python/libmuscle/checkpoint_triggers.py
index 6e4d644e..88d47553 100644
--- a/libmuscle/python/libmuscle/checkpoint_triggers.py
+++ b/libmuscle/python/libmuscle/checkpoint_triggers.py
@@ -202,6 +202,7 @@ def __init__(self, utc_reference: datetime, checkpoints: Checkpoints
 
         self._last_triggers = []    # type: List[str]
         self._first_reuse = True
+        self._max_f_init_next_timestamp = None  # type: Optional[float]
 
         # These attributes are only used to check if implementations are
         # following the guidelines
diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py
index f91bae55..8c358665 100644
--- a/libmuscle/python/libmuscle/instance.py
+++ b/libmuscle/python/libmuscle/instance.py
@@ -134,7 +134,9 @@ def reuse_instance(self, apply_overlay: bool = True) -> bool:
         # TODO: _f_init_cache should be empty here, or the user didn't
         # receive something that was sent on the last go-around.
         # At least emit a warning.
-        self.__pre_receive_f_init(apply_overlay)
+        if not (self.resuming() and self._first_run):
+            # when resuming we skip receiving on f_init in the first run
+            self.__pre_receive_f_init(apply_overlay)
 
         self._set_local_log_level()
         self._set_remote_log_level()
@@ -147,11 +149,11 @@ def reuse_instance(self, apply_overlay: bool = True) -> bool:
 
         if f_init_not_connected and no_settings_in:
             do_reuse = self._first_run
-            self._first_run = False
         else:
             for message in self._f_init_cache.values():
                 if isinstance(message.data, ClosePort):
                     do_reuse = False
+        self._first_run = False
 
         max_f_init_next_timestamp = max(
                 (msg.next_timestamp
diff --git a/libmuscle/python/libmuscle/manager/run_dir.py b/libmuscle/python/libmuscle/manager/run_dir.py
index c2a50ed9..6a50c2fe 100644
--- a/libmuscle/python/libmuscle/manager/run_dir.py
+++ b/libmuscle/python/libmuscle/manager/run_dir.py
@@ -72,6 +72,8 @@ def snapshot_dir(self, name: Optional[Reference] = None) -> Path:
             The path to the snapshot directory
         """
         if name is None:
-            return self.path / 'snapshots'
+            path = self.path / 'snapshots'
         else:
-            return self.instance_dir(name) / 'snapshots'
+            path = self.instance_dir(name) / 'snapshots'
+        path.mkdir(exist_ok=True)
+        return path

From 869a23b6f42898a03110d235cd3731503bf20c05 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Thu, 15 Sep 2022 14:07:52 +0200
Subject: [PATCH 047/183] Make snapshot_registry threaded and thread-safe

---
 libmuscle/python/libmuscle/manager/manager.py | 10 ++++--
 .../libmuscle/manager/snapshot_registry.py    | 32 ++++++++++++++++++-
 .../manager/test/test_snapshot_registry.py    | 32 +++++++++----------
 3 files changed, 55 insertions(+), 19 deletions(-)

diff --git a/libmuscle/python/libmuscle/manager/manager.py b/libmuscle/python/libmuscle/manager/manager.py
index d25d1977..d96842a7 100644
--- a/libmuscle/python/libmuscle/manager/manager.py
+++ b/libmuscle/python/libmuscle/manager/manager.py
@@ -51,8 +51,6 @@ def __init__(
                 _logger.warning('Checkpoints are configured but no run'
                                 ' directory is provided. Snapshots will be'
                                 ' stored in the current working directory.')
-        self._snapshot_registry = SnapshotRegistry(
-                configuration, snapshot_dir, self._topology_store)
 
         if self._run_dir:
             save_ymmsl(
@@ -73,6 +71,12 @@ def __init__(
         except ValueError:
             pass
 
+        # SnapshotRegistry creates a worker thread, must be created after
+        # instance_manager which forks the process
+        self._snapshot_registry = SnapshotRegistry(
+                configuration, snapshot_dir, self._topology_store)
+        self._snapshot_registry.start()
+
         self._server = MMPServer(
                 self._logger, self._configuration,
                 self._instance_registry, self._topology_store,
@@ -108,6 +112,8 @@ def stop(self) -> None:
         """Shuts down the manager."""
         # self._server.stop()
         self._server.stop()
+        self._snapshot_registry.shutdown()
+        self._snapshot_registry.join()
         self._logger.close()
 
     def wait(self) -> bool:
diff --git a/libmuscle/python/libmuscle/manager/snapshot_registry.py b/libmuscle/python/libmuscle/manager/snapshot_registry.py
index 6e7f1199..d43db295 100644
--- a/libmuscle/python/libmuscle/manager/snapshot_registry.py
+++ b/libmuscle/python/libmuscle/manager/snapshot_registry.py
@@ -5,6 +5,8 @@
 from itertools import chain, zip_longest
 from operator import attrgetter
 from pathlib import Path
+from queue import Queue
+from threading import Thread
 from typing import Dict, Optional, Set, List, Tuple, TypeVar
 from libmuscle.manager.topology_store import TopologyStore
 
@@ -19,6 +21,7 @@
 
 _SnapshotDictType = Dict[Reference, List["SnapshotNode"]]
 _ConnectionType = Tuple[Identifier, Identifier, "_ConnectionInfo"]
+_QueueItemType = Optional[Tuple[Reference, SnapshotMetadata]]
 _T = TypeVar("_T")
 
 
@@ -157,7 +160,7 @@ def do_consistency_check(
         return True
 
 
-class SnapshotRegistry:
+class SnapshotRegistry(Thread):
     """Registry of all snapshots taken by instances.
 
     Current snapshots are stored in a graph. Every node represents a snapshot
@@ -176,6 +179,8 @@ def __init__(
         Args:
             config: ymmsl configuration describing the workflow.
         """
+        super().__init__(name='SnapshotRegistry')
+
         if config.model is None or not isinstance(config.model, Model):
             raise ValueError('The yMMSL experiment description does not'
                              ' contain a (complete) model section, so there'
@@ -185,6 +190,7 @@ def __init__(
         self._snapshot_folder = snapshot_folder
         self._topology_store = topology_store
 
+        self._queue = Queue()               # type: Queue[_QueueItemType]
         self._snapshots = {}                # type: _SnapshotDictType
 
         self._instances = set()             # type: Set[Reference]
@@ -199,6 +205,30 @@ def register_snapshot(
             self, instance: Reference, snapshot: SnapshotMetadata) -> None:
         """Register a new snapshot.
 
+        Args:
+            instance: The instance that created the snapshot
+            snapshot: Metadata describing the snapshot
+        """
+        self._queue.put((instance, snapshot))
+
+    def run(self) -> None:
+        """Code executed in a separate thread
+        """
+        while True:
+            item = self._queue.get()
+            if item is None:
+                return
+            self._add_snapshot(*item)
+
+    def shutdown(self) -> None:
+        """Stop the snapshot registry thread
+        """
+        self._queue.put(None)
+
+    def _add_snapshot(
+            self, instance: Reference, snapshot: SnapshotMetadata) -> None:
+        """Register a new snapshot.
+
         Args:
             instance: The instance that created the snapshot
             snapshot: Metadata describing the snapshot
diff --git a/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py b/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py
index cc713c6a..71e3fb7c 100644
--- a/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py
+++ b/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py
@@ -259,7 +259,7 @@ def test_macro_micro_snapshots(
     micro = Reference('micro')
 
     macro_snapshot = make_snapshot(o_i=[3], s=[3])
-    snapshot_registry.register_snapshot(macro, macro_snapshot)
+    snapshot_registry._add_snapshot(macro, macro_snapshot)
 
     assert len(snapshot_registry._snapshots[macro]) == 1
     node = snapshot_registry._snapshots[macro][0]
@@ -281,14 +281,14 @@ def test_macro_micro_snapshots(
         # the macro snapshot above. However, it's still useful for testing the
         # consistency algorithm
         micro_snapshot = make_snapshot(f_i=[2], o_f=[1])
-        snapshot_registry.register_snapshot(micro, micro_snapshot)
+        snapshot_registry._add_snapshot(micro, micro_snapshot)
 
         assert len(snapshot_registry._snapshots[micro]) == 1
         assert not snapshot_registry._snapshots[micro][0].consistent
         snapshot_registry._write_snapshot_ymmsl.assert_not_called()
 
         micro_snapshot = make_snapshot(f_i=[3], o_f=[2])
-        snapshot_registry.register_snapshot(micro, micro_snapshot)
+        snapshot_registry._add_snapshot(micro, micro_snapshot)
 
         # micro snapshots should be cleaned up now!
         assert len(snapshot_registry._snapshots[micro]) == 1
@@ -299,7 +299,7 @@ def test_macro_micro_snapshots(
         snapshot_registry._write_snapshot_ymmsl.reset_mock()
 
         micro_snapshot = make_snapshot(f_i=[4], o_f=[3])
-        snapshot_registry.register_snapshot(micro, micro_snapshot)
+        snapshot_registry._add_snapshot(micro, micro_snapshot)
 
         # micro snapshots should be cleaned up now!
         assert len(snapshot_registry._snapshots[micro]) == 1
@@ -310,7 +310,7 @@ def test_macro_micro_snapshots(
         snapshot_registry._write_snapshot_ymmsl.reset_mock()
 
     macro_snapshot = make_snapshot(o_i=[4], s=[4])
-    snapshot_registry.register_snapshot(macro, macro_snapshot)
+    snapshot_registry._add_snapshot(macro, macro_snapshot)
     snapshot_registry._write_snapshot_ymmsl.assert_called_once()
 
 
@@ -324,14 +324,14 @@ def test_uq(uq: Configuration, micro_is_stateless: bool) -> None:
     rr = Reference('rr')
 
     qmc_snapshot = make_snapshot(parameters_out=[], states_in=[])
-    snapshot_registry.register_snapshot(qmc, qmc_snapshot)
+    snapshot_registry._add_snapshot(qmc, qmc_snapshot)
 
     rr_snapshot = make_snapshot(
             front_in=[1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
             front_out=[0] * 10,
             back_out=[1, 1, 1, 1, 1],
             back_in=[0] * 5)
-    snapshot_registry.register_snapshot(rr, rr_snapshot)
+    snapshot_registry._add_snapshot(rr, rr_snapshot)
     node = snapshot_registry._snapshots[rr][-1]
     assert qmc in node.consistent_peers
     snapshot_registry._write_snapshot_ymmsl.assert_not_called()
@@ -339,7 +339,7 @@ def test_uq(uq: Configuration, micro_is_stateless: bool) -> None:
     macro_snapshot = make_snapshot(
             muscle_settings_in=[1], final_state_out=[0], o_i=[0], s=[0])
     for i in range(5):
-        snapshot_registry.register_snapshot(macro + i, macro_snapshot)
+        snapshot_registry._add_snapshot(macro + i, macro_snapshot)
         node = snapshot_registry._snapshots[macro + i][-1]
         assert node.consistent_peers.keys() == {rr}
         if micro_is_stateless and i == 4:
@@ -351,7 +351,7 @@ def test_uq(uq: Configuration, micro_is_stateless: bool) -> None:
     if not micro_is_stateless:
         micro_snapshot = make_snapshot(f_i=[1], o_f=[0])
         for i in range(5):
-            snapshot_registry.register_snapshot(micro + i, micro_snapshot)
+            snapshot_registry._add_snapshot(micro + i, micro_snapshot)
             node = snapshot_registry._snapshots[micro + i][-1]
             assert node.consistent_peers.keys() == {macro + i}
             if i == 4:
@@ -361,7 +361,7 @@ def test_uq(uq: Configuration, micro_is_stateless: bool) -> None:
                 snapshot_registry._write_snapshot_ymmsl.assert_not_called()
 
     qmc_snapshot = make_snapshot(parameters_out=[1, 1, 1, 1, 1], states_in=[])
-    snapshot_registry.register_snapshot(qmc, qmc_snapshot)
+    snapshot_registry._add_snapshot(qmc, qmc_snapshot)
     node = snapshot_registry._snapshots[qmc][-1]
     assert node.consistent_peers.keys() == {rr}
     snapshot_registry._write_snapshot_ymmsl.assert_called_once()
@@ -385,25 +385,25 @@ def test_heuristic_rollbacks() -> None:
     snapshot_registry._write_snapshot_ymmsl = MagicMock()
 
     for i in range(4):
-        snapshot_registry.register_snapshot(comp1, make_snapshot(o_f=[i]))
+        snapshot_registry._add_snapshot(comp1, make_snapshot(o_f=[i]))
     assert len(snapshot_registry._snapshots[comp1]) == 4
 
     for i in range(10):
-        snapshot_registry.register_snapshot(
+        snapshot_registry._add_snapshot(
                 comp2, make_snapshot(f_i=[1], o_f=[0]))
-        snapshot_registry.register_snapshot(
+        snapshot_registry._add_snapshot(
                 comp3, make_snapshot(f_i=[1], o_f=[0]))
     assert len(snapshot_registry._snapshots[comp2]) == 10
     assert len(snapshot_registry._snapshots[comp3]) == 10
 
-    snapshot_registry.register_snapshot(comp2, make_snapshot(f_i=[2], o_f=[1]))
+    snapshot_registry._add_snapshot(comp2, make_snapshot(f_i=[2], o_f=[1]))
     assert len(snapshot_registry._snapshots[comp2]) == 11
-    snapshot_registry.register_snapshot(comp2, make_snapshot(f_i=[3], o_f=[2]))
+    snapshot_registry._add_snapshot(comp2, make_snapshot(f_i=[3], o_f=[2]))
     assert len(snapshot_registry._snapshots[comp2]) == 12
 
     snapshot_registry._write_snapshot_ymmsl.assert_not_called()
 
-    snapshot_registry.register_snapshot(
+    snapshot_registry._add_snapshot(
             comp4, make_snapshot(f_i=[1]))
     snapshot_registry._write_snapshot_ymmsl.assert_called()
 

From d2c6a141d0fbb4b86b42d570c7c9cd8908ead392 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Thu, 15 Sep 2022 16:33:16 +0200
Subject: [PATCH 048/183] Snapshot/resume test with multiplicity

---
 integration_test/test_snapshot_macro_micro.py | 107 ++++++++++++++++--
 .../libmuscle/manager/snapshot_registry.py    |   2 +-
 2 files changed, 99 insertions(+), 10 deletions(-)

diff --git a/integration_test/test_snapshot_macro_micro.py b/integration_test/test_snapshot_macro_micro.py
index bbfd42db..93427098 100644
--- a/integration_test/test_snapshot_macro_micro.py
+++ b/integration_test/test_snapshot_macro_micro.py
@@ -1,4 +1,5 @@
 import sys
+
 import pytest
 from ymmsl import Operator, load
 
@@ -7,6 +8,9 @@
 from libmuscle.manager.run_dir import RunDir
 
 
+_LOG_LEVEL = 'INFO'  # set to DEBUG for additional debug info
+
+
 def macro():
     instance = Instance({
             Operator.O_I: ['o_i'],
@@ -45,6 +49,46 @@ def macro():
             instance.save_final_snapshot(Message(t_cur, None, i))
 
 
+def macro_vector():
+    instance = Instance({
+            Operator.O_I: ['o_i[]'],
+            Operator.S: ['s[]']})
+
+    while instance.reuse_instance():
+        t_cur = instance.get_setting('t0', 'float')
+        dt = instance.get_setting('dt', 'float')
+        t_max = instance.get_setting('t_max', 'float')
+
+        if instance.resuming():
+            msg = instance.load_snapshot()
+            t_cur = msg.timestamp
+            assert msg.next_timestamp == pytest.approx(t_cur + dt)
+            i = msg.data
+            assert i >= 0
+        else:
+            i = 0
+
+        while t_cur + dt <= t_max:
+            t_next = t_cur + dt
+
+            if instance.should_save_snapshot(t_cur, t_next):
+                instance.save_snapshot(Message(t_cur, t_next, i))
+
+            t_next = None if t_next + dt > t_max else t_next
+            for slot in range(instance.get_port_length('o_i')):
+                instance.send('o_i', Message(t_cur, t_next, i), slot)
+
+            for slot in range(instance.get_port_length('s')):
+                msg = instance.receive('s', slot)
+                assert msg.data == i
+
+            i += 1
+            t_cur += dt
+
+        if instance.should_save_final_snapshot(t_cur):
+            instance.save_final_snapshot(Message(t_cur, None, i))
+
+
 def micro():
     instance = Instance({
             Operator.F_INIT: ['f_i'],
@@ -78,8 +122,9 @@ def micro():
         instance.send('o_f', Message(t_cur, None, i))
 
 
-def test_snapshot_macro_micro(tmp_path):
-    ymmsl_text = f"""ymmsl_version: v0.1
+@pytest.fixture
+def base_config():
+    return load(f"""ymmsl_version: v0.1
 model:
   name: test_snapshot
   components:
@@ -94,7 +139,7 @@ def test_snapshot_macro_micro(tmp_path):
   macro.t_max: 1.9
   micro.dt: 0.009
   micro.t_max: 0.1
-  muscle_remote_log_level: DEBUG
+  muscle_remote_log_level: {_LOG_LEVEL}
 implementations:
   macro_implementation:
     executable: {sys.executable}
@@ -115,11 +160,13 @@ def test_snapshot_macro_micro(tmp_path):
     threads: 1
 checkpoints:
   simulation_time:
-  - every: 0.4"""
-    ymmsl_doc = load(ymmsl_text)
+  - every: 0.4""")
+
 
+def test_snapshot_macro_micro(tmp_path, base_config):
+    base_config.check_consistent()
     run_dir1 = RunDir(tmp_path / 'run1')
-    manager = Manager(ymmsl_doc, run_dir1)
+    manager = Manager(base_config, run_dir1, _LOG_LEVEL)
     manager.start_instances()
     assert manager.wait()
 
@@ -139,10 +186,12 @@ def test_snapshot_macro_micro(tmp_path):
         assert snapshot_docs[i].resume['macro'] == macro_snapshots[i - 1]
         assert snapshot_docs[i].resume['micro'] == micro_snapshots[i - 1]
 
-    ymmsl_doc.update(snapshot_docs[4])
-    del ymmsl_doc.settings['muscle_snapshot_directory']
+    base_config.update(snapshot_docs[4])
+    del base_config.settings['muscle_snapshot_directory']
+    base_config.check_consistent()
+
     run_dir2 = RunDir(tmp_path / 'run2')
-    manager = Manager(ymmsl_doc, run_dir2)
+    manager = Manager(base_config, run_dir2, _LOG_LEVEL)
     manager.start_instances()
     assert manager.wait()
 
@@ -154,9 +203,49 @@ def test_snapshot_macro_micro(tmp_path):
     assert len(snapshots_ymmsl) == 2
 
 
+def test_snapshot_macro_vector_micro(tmp_path, base_config):
+    macro_implementation = base_config.implementations['macro_implementation']
+    macro_implementation.args[-1] = 'macro_vector'
+    base_config.model.components[1].multiplicity = [2]
+    base_config.check_consistent()
+
+    run_dir1 = RunDir(tmp_path / 'run1')
+    manager = Manager(base_config, run_dir1, _LOG_LEVEL)
+    manager.start_instances()
+    assert manager.wait()
+
+    macro_snapshots = sorted(run_dir1.snapshot_dir().glob('macro*'))
+    assert len(macro_snapshots) == 6  # 0, 0.4, 0.8, 1.2, 1.6, final
+    micro_snapshots = sorted(run_dir1.snapshot_dir().glob('micro*'))
+    assert len(micro_snapshots) == 6 * 2  # 0, 0.4, 0.8, 1.2, 1.6, final
+    snapshots_ymmsl = sorted(run_dir1.snapshot_dir().glob('snapshot_*.ymmsl'))
+    # iff micro[0] snapshots before micro[1] at t==0.4, an additional workflow
+    # snapshot can be created
+    assert len(snapshots_ymmsl) in (7, 8)
+
+    snapshot_docs = list(map(load, sorted(snapshots_ymmsl)))
+    base_config.update(snapshot_docs[-3])
+    del base_config.settings['muscle_snapshot_directory']
+    base_config.check_consistent()
+
+    run_dir2 = RunDir(tmp_path / 'run2')
+    manager = Manager(base_config, run_dir2, _LOG_LEVEL)
+    manager.start_instances()
+    assert manager.wait()
+
+    macro_snapshots = sorted(run_dir2.snapshot_dir().glob('macro*'))
+    assert len(macro_snapshots) == 2  # 1.6, final
+    micro_snapshots = sorted(run_dir2.snapshot_dir().glob('micro*'))
+    assert len(micro_snapshots) == 3 * 2  # 1.2, 1.6, final
+    snapshots_ymmsl = sorted(run_dir2.snapshot_dir().glob('snapshot_*.ymmsl'))
+    assert len(snapshots_ymmsl) == 2
+
+
 if __name__ == "__main__":
     if 'macro' in sys.argv:
         macro()
+    elif 'macro_vector' in sys.argv:
+        macro_vector()
     elif 'micro' in sys.argv:
         micro()
     else:
diff --git a/libmuscle/python/libmuscle/manager/snapshot_registry.py b/libmuscle/python/libmuscle/manager/snapshot_registry.py
index d43db295..ed1618e3 100644
--- a/libmuscle/python/libmuscle/manager/snapshot_registry.py
+++ b/libmuscle/python/libmuscle/manager/snapshot_registry.py
@@ -8,12 +8,12 @@
 from queue import Queue
 from threading import Thread
 from typing import Dict, Optional, Set, List, Tuple, TypeVar
-from libmuscle.manager.topology_store import TopologyStore
 
 from ymmsl import (
         Reference, Model, Identifier, Implementation, save,
         PartialConfiguration, ImplementationState as IState)
 
+from libmuscle.manager.topology_store import TopologyStore
 from libmuscle.snapshot import SnapshotMetadata
 
 

From 11bb7f817cc3569dde7a85338d4ccd948c17c2dd Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Fri, 16 Sep 2022 11:41:42 +0200
Subject: [PATCH 049/183] Remove no-longer-relevant TODO comments

---
 libmuscle/python/libmuscle/communicator.py | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/libmuscle/python/libmuscle/communicator.py b/libmuscle/python/libmuscle/communicator.py
index 7ffff004..d5ddfc39 100644
--- a/libmuscle/python/libmuscle/communicator.py
+++ b/libmuscle/python/libmuscle/communicator.py
@@ -79,7 +79,6 @@ def __init__(self, kernel: Reference, index: List[int],
             profiler: The profiler to use for recording sends and
                     receives.
         """
-        # TODO: pass a SnapshotManager and store as self._snapshot_manager
         self._kernel = kernel
         self._index = index
         self._declared_ports = declared_ports
@@ -418,8 +417,6 @@ def __ports_from_declared(self) -> Dict[str, Port]:
                 ports[port_name] = Port(
                         port_name, operator, is_vector, is_connected,
                         len(self._index), port_peer_dims)
-                # TODO: retrieve num_messages[] for this port from
-                # self._snapshot_manager when resuming
         return ports
 
     def __ports_from_conduits(self, conduits: List[Conduit]
@@ -451,8 +448,6 @@ def __ports_from_conduits(self, conduits: List[Conduit]
                 ports[str(port_id)] = Port(
                         str(port_id), operator, is_vector, is_connected,
                         len(self._index), port_peer_dims)
-                # TODO: retrieve num_messages[] for this port from
-                # self._snapshot_manager when resuming
         return ports
 
     def __settings_in_port(self, conduits: List[Conduit]) -> Port:
@@ -472,8 +467,6 @@ def __settings_in_port(self, conduits: List[Conduit]) -> Port:
                                     conduit.sending_component()))
         return Port('muscle_settings_in', Operator.F_INIT, False, False,
                     len(self._index), [])
-        # TODO: retrieve num_messages[] for this port from
-        # self._snapshot_manager when resuming
 
     def __get_client(self, instance: Reference) -> MPPClient:
         """Get or create a client to connect to the given instance.

From 58dc980981e7911f7af3a4dd2c115682d5b85ef5 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Fri, 16 Sep 2022 15:53:16 +0200
Subject: [PATCH 050/183] Remove redundant cast

---
 libmuscle/python/libmuscle/instance.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py
index 8c358665..b74507ed 100644
--- a/libmuscle/python/libmuscle/instance.py
+++ b/libmuscle/python/libmuscle/instance.py
@@ -165,7 +165,7 @@ def reuse_instance(self, apply_overlay: bool = True) -> bool:
         # enabled, it might not exist and a KeyError is raised.
         try:
             snapshot_dir = self.get_setting('muscle_snapshot_directory', 'str')
-            snapshot_path = Path(cast(str, snapshot_dir))
+            snapshot_path = Path(snapshot_dir)
         except KeyError:
             snapshot_path = None
         self._snapshot_manager.reuse_instance(

From afb25c669aff0080cc36ab73c034399920930567 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Tue, 11 Oct 2022 16:52:53 +0200
Subject: [PATCH 051/183] Implement broadcasting (Python)

Allow multiple conduits connected to a single output port. The only
limitation is that the multiplicity of all connected peers must be the
same (to avoid interference with vector port functionality).
---
 integration_test/test_broadcast.py            | 44 +++++++++++++++
 libmuscle/python/libmuscle/communicator.py    | 53 +++++++++++++------
 libmuscle/python/libmuscle/peer_manager.py    | 44 +++++++++------
 .../libmuscle/test/test_communicator.py       | 43 +++++++--------
 4 files changed, 130 insertions(+), 54 deletions(-)
 create mode 100644 integration_test/test_broadcast.py

diff --git a/integration_test/test_broadcast.py b/integration_test/test_broadcast.py
new file mode 100644
index 00000000..0877cd8f
--- /dev/null
+++ b/integration_test/test_broadcast.py
@@ -0,0 +1,44 @@
+from ymmsl import (Component, Conduit, Configuration, Operator, Model,
+                   Settings)
+
+from libmuscle import Instance, Message
+from libmuscle.runner import run_simulation
+
+
+def broadcaster():
+    instance = Instance({Operator.O_F: ['out']})
+
+    while instance.reuse_instance():
+        # o_f
+        message = Message(0.0, None, 'testing')
+        instance.send('out', message)
+
+
+def receiver():
+    instance = Instance({Operator.F_INIT: ['in']})
+
+    while instance.reuse_instance():
+        # f_init
+        msg = instance.receive('in')
+        assert msg.data == 'testing'
+
+
+def test_broadcast(log_file_in_tmpdir):
+    elements = [
+            Component('broadcast', 'broadcaster'),
+            Component('first', 'receiver'),
+            Component('second', 'receiver')]
+
+    conduits = [
+                Conduit('broadcast.out', 'first.in'),
+                Conduit('broadcast.out', 'second.in')]
+
+    model = Model('test_model', elements, conduits)
+    settings = Settings()
+
+    configuration = Configuration(model, settings)
+
+    implementations = {
+            'broadcaster': broadcaster,
+            'receiver': receiver}
+    run_simulation(configuration, implementations)
diff --git a/libmuscle/python/libmuscle/communicator.py b/libmuscle/python/libmuscle/communicator.py
index efdcb30d..fc24a9a8 100644
--- a/libmuscle/python/libmuscle/communicator.py
+++ b/libmuscle/python/libmuscle/communicator.py
@@ -206,27 +206,30 @@ def send_message(
             return
 
         port = self._ports[port_name]
-        profile_event = self._profiler.start(ProfileEventType.SEND, port,
-                                             None, slot, None)
 
-        recv_endpoint = self._peer_manager.get_peer_endpoint(
+        recv_endpoints = self._peer_manager.get_peer_endpoints(
                 snd_endpoint.port, slot_list)
 
         port_length = None
         if self._ports[port_name].is_resizable():
             port_length = self._ports[port_name].get_length()
 
-        mcp_message = MPPMessage(snd_endpoint.ref(), recv_endpoint.ref(),
-                                 port_length,
-                                 message.timestamp, message.next_timestamp,
-                                 cast(Settings, message.settings),
-                                 message.data)
-        encoded_message = mcp_message.encoded()
-        self._post_office.deposit(recv_endpoint.ref(), encoded_message)
-        profile_event.stop()
-        if port.is_vector():
-            profile_event.port_length = port.get_length()
-        profile_event.message_size = len(encoded_message)
+        for recv_endpoint in recv_endpoints:
+            profile_event = self._profiler.start(ProfileEventType.SEND, port,
+                                                 None, slot, None)
+
+            mcp_message = MPPMessage(snd_endpoint.ref(), recv_endpoint.ref(),
+                                     port_length,
+                                     message.timestamp, message.next_timestamp,
+                                     cast(Settings, message.settings),
+                                     message.data)
+            encoded_message = mcp_message.encoded()
+            self._post_office.deposit(recv_endpoint.ref(), encoded_message)
+
+            profile_event.stop()
+            if port.is_vector():
+                profile_event.port_length = port.get_length()
+            profile_event.message_size = len(encoded_message)
 
     def receive_message(self, port_name: str, slot: Optional[int] = None,
                         default: Optional[Message] = None
@@ -289,8 +292,10 @@ def receive_message(self, port_name: str, slot: Optional[int] = None,
         profile_event = self._profiler.start(ProfileEventType.RECEIVE, port,
                                              None, slot, None)
 
-        snd_endpoint = self._peer_manager.get_peer_endpoint(
-                recv_endpoint.port, slot_list)
+        # peer_manager already checks that there is at most one snd_endpoint
+        # connected to the port we receive on
+        snd_endpoint = self._peer_manager.get_peer_endpoints(
+                recv_endpoint.port, slot_list)[0]
         client = self.__get_client(snd_endpoint.instance())
         mcp_message_bytes = client.receive(recv_endpoint.ref())
         mcp_message = MPPMessage.from_bytes(mcp_message_bytes)
@@ -372,9 +377,23 @@ def __ports_from_declared(self) -> Dict[str, Port]:
                 port_id = Identifier(port_name)
                 is_connected = self._peer_manager.is_connected(port_id)
                 if is_connected:
-                    peer_port = self._peer_manager.get_peer_port(port_id)
+                    peer_ports = self._peer_manager.get_peer_ports(port_id)
+                    peer_port = peer_ports[0]
                     peer_ce = peer_port[:-1]
                     port_peer_dims = self._peer_manager.get_peer_dims(peer_ce)
+                    for peer_port in peer_ports[1:]:
+                        peer_ce = peer_port[:-1]
+                        if port_peer_dims != self._peer_manager.get_peer_dims(
+                                peer_ce):
+                            port_strs = ', '.join(map(str, peer_ports))
+                            raise RuntimeError(('Broadcast port "{}" is'
+                                                ' connected to peers with'
+                                                ' different dimensions. All'
+                                                ' peer components that this'
+                                                ' port is connected to must'
+                                                ' have the same multiplicity.'
+                                                ' Connected ports: {}.'
+                                                ).format(port_name, port_strs))
                 else:
                     port_peer_dims = []
                 ports[port_name] = Port(
diff --git a/libmuscle/python/libmuscle/peer_manager.py b/libmuscle/python/libmuscle/peer_manager.py
index 5b8728c1..0a7600c0 100644
--- a/libmuscle/python/libmuscle/peer_manager.py
+++ b/libmuscle/python/libmuscle/peer_manager.py
@@ -34,15 +34,21 @@ def __init__(self, kernel: Reference, index: List[int],
         self.__index = index
 
         # peer port ids, indexed by local kernel.port id
-        self.__peers = dict()  # type: Dict[Reference, Reference]
+        self.__peers = dict()  # type: Dict[Reference, List[Reference]]
 
         for conduit in conduits:
             if str(conduit.sending_component()) == str(kernel):
                 # we send on the port this conduit attaches to
-                self.__peers[conduit.sender] = conduit.receiver
+                self.__peers.setdefault(
+                        conduit.sender, []).append(conduit.receiver)
             if str(conduit.receiving_component()) == str(kernel):
                 # we receive on the port this conduit attaches to
-                self.__peers[conduit.receiver] = conduit.sender
+                if conduit.receiver in self.__peers:
+                    raise RuntimeError(('Receiving port "{}" is connected by'
+                                        ' multiple conduits, but at most one'
+                                        ' is allowed.'
+                                        ).format(conduit.receiving_port()))
+                self.__peers[conduit.receiver] = [conduit.sender]
 
         self.__peer_dims = peer_dims    # indexed by kernel id
         self.__peer_locations = peer_locations  # indexed by instance id
@@ -56,8 +62,8 @@ def is_connected(self, port: Identifier) -> bool:
         recv_port_full = self.__kernel + port
         return recv_port_full in self.__peers
 
-    def get_peer_port(self, port: Identifier) -> Reference:
-        """Get a reference for the peer port.
+    def get_peer_ports(self, port: Identifier) -> List[Reference]:
+        """Get a reference for the peer ports.
 
         Args:
             port: Name of the port on this side.
@@ -83,8 +89,8 @@ def get_peer_locations(self, peer_instance: Reference) -> List[str]:
         """
         return self.__peer_locations[peer_instance]
 
-    def get_peer_endpoint(self, port: Identifier, slot: List[int]
-                          ) -> Endpoint:
+    def get_peer_endpoints(self, port: Identifier, slot: List[int]
+                           ) -> List[Endpoint]:
         """Determine the peer endpoint for the given port and slot.
 
         Args:
@@ -94,14 +100,20 @@ def get_peer_endpoint(self, port: Identifier, slot: List[int]
         Returns:
             The peer endpoint.
         """
-        peer = self.__peers[self.__kernel + port]
-        peer_kernel = peer[:-1]
-        peer_port = cast(Identifier, peer[-1])
+        peers = self.__peers[self.__kernel + port]
+        endpoints = []
 
-        total_index = self.__index + slot
+        for peer in peers:
+            peer_kernel = peer[:-1]
+            peer_port = cast(Identifier, peer[-1])
 
-        # rebalance the indices
-        peer_dim = len(self.__peer_dims[peer_kernel])
-        peer_index = total_index[0:peer_dim]
-        peer_slot = total_index[peer_dim:]
-        return Endpoint(peer_kernel, peer_index, peer_port, peer_slot)
+            total_index = self.__index + slot
+
+            # rebalance the indices
+            peer_dim = len(self.__peer_dims[peer_kernel])
+            peer_index = total_index[0:peer_dim]
+            peer_slot = total_index[peer_dim:]
+            endpoints.append(
+                    Endpoint(peer_kernel, peer_index, peer_port, peer_slot))
+
+        return endpoints
diff --git a/libmuscle/python/libmuscle/test/test_communicator.py b/libmuscle/python/libmuscle/test/test_communicator.py
index 8f0f1238..e2414856 100644
--- a/libmuscle/python/libmuscle/test/test_communicator.py
+++ b/libmuscle/python/libmuscle/test/test_communicator.py
@@ -1,3 +1,4 @@
+from typing import List
 from libmuscle.communicator import Communicator, Endpoint, Message
 from libmuscle.mpp_message import ClosePort, MPPMessage
 from libmuscle.port import Port
@@ -43,26 +44,26 @@ def communicator() -> Communicator:
     pm = communicator._peer_manager
     pm.is_connected.return_value = True
 
-    def gpp(x) -> Reference:
+    def gpp(x) -> List[Reference]:
         if 'out' in str(x):
-            return Reference('in')
-        return Reference('out')
+            return [Reference('in')]
+        return [Reference('out')]
 
-    pm.get_peer_port = gpp
+    pm.get_peer_ports = gpp
 
     pm.get_peer_dims.return_value = []
     pm.get_peer_locations.return_value = ['direct:test']
 
-    def gpe(p, s) -> Reference:
+    def gpe(p, s) -> List[Reference]:
         endpoint = MagicMock()
         endpoint.instance.return_value = Reference('other')
         if 'out' in str(p):
             endpoint.ref.return_value = Reference('other.in[13]')
         else:
             endpoint.ref.return_value = Reference('other.out')
-        return endpoint
+        return [endpoint]
 
-    pm.get_peer_endpoint = gpe
+    pm.get_peer_endpoints = gpe
 
     communicator._ports = {
             'out': Port('out', Operator.O_I, False, True, 1, []),
@@ -79,26 +80,26 @@ def communicator2() -> Communicator:
     pm = communicator._peer_manager
     pm.is_connected.return_value = True
 
-    def gpp(x: Reference) -> Reference:
+    def gpp(x) -> List[Reference]:
         if 'out' in str(x):
-            return Reference('in')
-        return Reference('out')
+            return [Reference('in')]
+        return [Reference('out')]
 
-    pm.get_peer_port = gpp
+    pm.get_peer_ports = gpp
 
     pm.get_peer_dims.return_value = []
     pm.get_peer_locations.return_value = ['direct:test']
 
-    def gpe(p, s) -> Reference:
+    def gpe(p, s) -> List[Reference]:
         endpoint = MagicMock()
         endpoint.instance.return_value = Reference('kernel[13]')
         if 'out' in str(p):
             endpoint.ref.return_value = Reference('kernel[13].in')
         else:
             endpoint.ref.return_value = Reference('kernel[13].out')
-        return endpoint
+        return [endpoint]
 
-    pm.get_peer_endpoint = gpe
+    pm.get_peer_endpoints = gpe
 
     communicator._ports = {
             'out': Port('out', Operator.O_I, True, True, 0, [20]),
@@ -115,26 +116,26 @@ def communicator3() -> Communicator:
     pm = communicator._peer_manager
     pm.is_connected.return_value = True
 
-    def gpp(x: Reference) -> Reference:
+    def gpp(x) -> List[Reference]:
         if 'out' in str(x):
-            return Reference('in')
-        return Reference('out')
+            return [Reference('in')]
+        return [Reference('out')]
 
-    pm.get_peer_port = gpp
+    pm.get_peer_ports = gpp
 
     pm.get_peer_dims.return_value = []
     pm.get_peer_locations.return_value = ['direct:test']
 
-    def gpe(p, s) -> Reference:
+    def gpe(p, s) -> List[Reference]:
         endpoint = MagicMock()
         endpoint.instance.return_value = Reference('other')
         if 'out' in str(p):
             endpoint.ref.return_value = Reference('other.in[13]')
         else:
             endpoint.ref.return_value = Reference('other.out[13]')
-        return endpoint
+        return [endpoint]
 
-    pm.get_peer_endpoint = gpe
+    pm.get_peer_endpoints = gpe
 
     communicator._ports = {
             'out': Port('out', Operator.O_I, True, True, 0, []),

From 4e01031f61308bb0bdd109e954c7c6db977d1f76 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Thu, 13 Oct 2022 10:57:30 +0200
Subject: [PATCH 052/183] Rename 'broadcast' to 'multicast'

---
 integration_test/{test_broadcast.py => test_multicast.py} | 6 +++---
 libmuscle/python/libmuscle/communicator.py                | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)
 rename integration_test/{test_broadcast.py => test_multicast.py} (91%)

diff --git a/integration_test/test_broadcast.py b/integration_test/test_multicast.py
similarity index 91%
rename from integration_test/test_broadcast.py
rename to integration_test/test_multicast.py
index 0877cd8f..8dedee17 100644
--- a/integration_test/test_broadcast.py
+++ b/integration_test/test_multicast.py
@@ -5,7 +5,7 @@
 from libmuscle.runner import run_simulation
 
 
-def broadcaster():
+def multicaster():
     instance = Instance({Operator.O_F: ['out']})
 
     while instance.reuse_instance():
@@ -23,7 +23,7 @@ def receiver():
         assert msg.data == 'testing'
 
 
-def test_broadcast(log_file_in_tmpdir):
+def test_multicast(log_file_in_tmpdir):
     elements = [
             Component('broadcast', 'broadcaster'),
             Component('first', 'receiver'),
@@ -39,6 +39,6 @@ def test_broadcast(log_file_in_tmpdir):
     configuration = Configuration(model, settings)
 
     implementations = {
-            'broadcaster': broadcaster,
+            'broadcaster': multicaster,
             'receiver': receiver}
     run_simulation(configuration, implementations)
diff --git a/libmuscle/python/libmuscle/communicator.py b/libmuscle/python/libmuscle/communicator.py
index fc24a9a8..cefd5e3b 100644
--- a/libmuscle/python/libmuscle/communicator.py
+++ b/libmuscle/python/libmuscle/communicator.py
@@ -386,13 +386,13 @@ def __ports_from_declared(self) -> Dict[str, Port]:
                         if port_peer_dims != self._peer_manager.get_peer_dims(
                                 peer_ce):
                             port_strs = ', '.join(map(str, peer_ports))
-                            raise RuntimeError(('Broadcast port "{}" is'
+                            raise RuntimeError(('Multicast port "{}" is'
                                                 ' connected to peers with'
                                                 ' different dimensions. All'
                                                 ' peer components that this'
                                                 ' port is connected to must'
                                                 ' have the same multiplicity.'
-                                                ' Connected ports: {}.'
+                                                ' Connected to ports: {}.'
                                                 ).format(port_name, port_strs))
                 else:
                     port_peer_dims = []

From 27c997c68d6dd33e6a596bdb8a1818fac98e22c8 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Fri, 14 Oct 2022 14:22:02 +0200
Subject: [PATCH 053/183] tox dependency to ymmsl branch feature/multicast

---
 tox.ini | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tox.ini b/tox.ini
index 0e2a1348..9c6c3968 100644
--- a/tox.ini
+++ b/tox.ini
@@ -8,6 +8,7 @@ deps =
     flake8
     pytest
     pytest-cov
+    git+https://github.com/multiscale/ymmsl-python.git@feature/multicast#egg=ymmsl
 
 passenv =
     MUSCLE_TEST_PYTHON_ONLY

From 839ced29a32d2c7278dd1b9007eecaabe6dc06d9 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Fri, 14 Oct 2022 14:22:51 +0200
Subject: [PATCH 054/183] Implement multicast in C++

Analogous to python implementation.
---
 libmuscle/cpp/src/libmuscle/communicator.cpp  | 51 ++++++++++++----
 libmuscle/cpp/src/libmuscle/peer_manager.cpp  | 60 +++++++++++++------
 libmuscle/cpp/src/libmuscle/peer_manager.hpp  | 15 ++---
 .../tests/mocks/mock_peer_manager.cpp         |  8 +--
 .../tests/mocks/mock_peer_manager.hpp         |  8 +--
 .../src/libmuscle/tests/test_communicator.cpp | 45 +++++++++-----
 .../src/libmuscle/tests/test_peer_manager.cpp | 24 ++++----
 7 files changed, 137 insertions(+), 74 deletions(-)

diff --git a/libmuscle/cpp/src/libmuscle/communicator.cpp b/libmuscle/cpp/src/libmuscle/communicator.cpp
index 4e9c5139..6b522259 100644
--- a/libmuscle/cpp/src/libmuscle/communicator.cpp
+++ b/libmuscle/cpp/src/libmuscle/communicator.cpp
@@ -120,7 +120,7 @@ void Communicator::send_message(
 
     // TODO start profile event
 
-    Endpoint recv_endpoint = peer_manager_->get_peer_endpoint(
+    auto recv_endpoints = peer_manager_->get_peer_endpoints(
             snd_endpoint.port, slot_list);
 
     Data settings_overlay(message.settings());
@@ -129,17 +129,18 @@ void Communicator::send_message(
     if (ports_.at(port_name).is_resizable())
         port_length = ports_.at(port_name).get_length();
 
-    MPPMessage mpp_message(
-            snd_endpoint.ref(), recv_endpoint.ref(),
-            port_length, message.timestamp(), Optional<double>(),
-            settings_overlay, message.data());
+    for (auto recv_endpoint : recv_endpoints) {
+        MPPMessage mpp_message(
+                snd_endpoint.ref(), recv_endpoint.ref(),
+                port_length, message.timestamp(), Optional<double>(),
+                settings_overlay, message.data());
 
-    if (message.has_next_timestamp())
-        mpp_message.next_timestamp = message.next_timestamp();
-
-    auto message_bytes = std::make_unique<DataConstRef>(mpp_message.encoded());
-    post_office_.deposit(recv_endpoint.ref(), std::move(message_bytes));
+        if (message.has_next_timestamp())
+            mpp_message.next_timestamp = message.next_timestamp();
 
+        auto message_bytes = std::make_unique<DataConstRef>(mpp_message.encoded());
+        post_office_.deposit(recv_endpoint.ref(), std::move(message_bytes));
+    }
     // TODO: stop and complete profile event
 }
 
@@ -177,8 +178,10 @@ Message Communicator::receive_message(
 
     // TODO start profile event
 
-    Endpoint snd_endpoint = peer_manager_->get_peer_endpoint(
-            recv_endpoint.port, slot_list);
+    // peer_manager already checks that there is at most one snd_endpoint
+    // connected to the port we receive on
+    Endpoint snd_endpoint = peer_manager_->get_peer_endpoints(
+            recv_endpoint.port, slot_list).at(0);
     MPPClient & client = get_client_(snd_endpoint.instance());
     auto mpp_message = MPPMessage::from_bytes(
             client.receive(recv_endpoint.ref()));
@@ -260,9 +263,31 @@ Communicator::Ports_ Communicator::ports_from_declared_() {
             bool is_connected = peer_manager_->is_connected(port_name);
             std::vector<int> port_peer_dims;
             if (is_connected) {
-                Reference peer_port = peer_manager_->get_peer_port(port_name);
+                auto peer_ports = peer_manager_->get_peer_ports(port_name);
+                Reference peer_port = peer_ports.at(0);
                 Reference peer_ce(peer_port.cbegin(), std::prev(peer_port.cend()));
                 port_peer_dims = peer_manager_->get_peer_dims(peer_ce);
+                for (std::size_t i = 1; i < peer_ports.size(); i++) {
+                    peer_port = peer_ports.at(i);
+                    peer_ce = Reference(peer_port.cbegin(), std::prev(peer_port.cend()));
+                    if (port_peer_dims != peer_manager_->get_peer_dims(peer_ce)) {
+                        std::stringstream ss;
+                        ss << "Multicast port \"" << port_name;
+                        ss << "\" is connected to peers with different";
+                        ss << " dimensions. All peer components that this";
+                        ss << " port is connected to must have the same";
+                        ss << " multiplicity. Connected to ports: ";
+                        bool first = true;
+                        for (auto port : peer_ports) {
+                            if (first)
+                                first = false;
+                            else
+                                ss << ", ";
+                            ss << port;
+                        }
+                        throw std::runtime_error(ss.str());
+                    }
+                }
             }
             ports.emplace(port_name, Port(
                     port_name, ppo.first, is_vector, is_connected,
diff --git a/libmuscle/cpp/src/libmuscle/peer_manager.cpp b/libmuscle/cpp/src/libmuscle/peer_manager.cpp
index 02631bc4..51772072 100644
--- a/libmuscle/cpp/src/libmuscle/peer_manager.cpp
+++ b/libmuscle/cpp/src/libmuscle/peer_manager.cpp
@@ -1,5 +1,6 @@
 #include <libmuscle/peer_manager.hpp>
 
+#include <sstream>
 
 using ymmsl::Conduit;
 using ymmsl::Identifier;
@@ -22,12 +23,26 @@ PeerManager::PeerManager(
     , peer_locations_(peer_locations)   // indexed by peer instance id
 {
     for (auto const & conduit : conduits) {
-        if (conduit.sending_component() == kernel_)
+        if (conduit.sending_component() == kernel_) {
             // we send on the port this conduit attaches to
-            peers_.emplace(conduit.sender, conduit.receiver);
-        if (conduit.receiving_component() == kernel_)
+            auto search = peers_.find(conduit.sender);
+            if (search == peers_.end())
+                search = peers_.emplace(
+                        conduit.sender, std::vector<Reference>()).first;
+            search->second.push_back(conduit.receiver);
+        }
+        if (conduit.receiving_component() == kernel_) {
             // we receive on the port this conduit attaches to
-            peers_.emplace(conduit.receiver, conduit.sender);
+            if (peers_.count(conduit.receiver)) {
+                std::stringstream ss;
+                ss << "Receiving port \"" << conduit.receiving_port();
+                ss << "\" is connected by multiple conduits, but at most one";
+                ss << " is allowed.";
+                throw std::runtime_error(ss.str());
+            }
+            std::vector<Reference> vec = {conduit.sender};
+            peers_.emplace(conduit.receiver, vec);
+        }
     }
 }
 
@@ -35,7 +50,8 @@ bool PeerManager::is_connected(Identifier const & port) const {
     return peers_.count(kernel_ + port);
 }
 
-Reference PeerManager::get_peer_port(Identifier const & port) const {
+ std::vector<ymmsl::Reference> const & PeerManager::get_peer_ports(
+            Identifier const & port) const {
     return peers_.at(kernel_ + port);
 }
 
@@ -49,24 +65,30 @@ std::vector<std::string> PeerManager::get_peer_locations(
     return peer_locations_.at(peer_instance);
 }
 
-Endpoint PeerManager::get_peer_endpoint(
+std::vector<Endpoint> const PeerManager::get_peer_endpoints(
         Identifier const & port,
         std::vector<int> const & slot
         ) const
 {
-    Reference peer = peers_.at(kernel_ + port);
-    Reference peer_kernel(peer.cbegin(), std::prev(peer.cend()));
-    Identifier peer_port = std::prev(peer.cend())->identifier();
-
-    std::vector<int> total_index = index_;
-    total_index.insert(total_index.end(), slot.cbegin(), slot.cend());
-
-    // rebalance the indices
-    int peer_dim = peer_dims_.at(peer_kernel).size();
-    auto peer_dim_it = std::next(total_index.cbegin(), peer_dim);
-    std::vector<int> peer_index(total_index.cbegin(), peer_dim_it);
-    std::vector<int> peer_slot(peer_dim_it, total_index.cend());
-    return Endpoint(peer_kernel, peer_index, peer_port, peer_slot);
+    auto peers = peers_.at(kernel_ + port);
+    std::vector<Endpoint> endpoints;
+
+    for (auto peer : peers) {
+        Reference peer_kernel(peer.cbegin(), std::prev(peer.cend()));
+        Identifier peer_port = std::prev(peer.cend())->identifier();
+
+        std::vector<int> total_index = index_;
+        total_index.insert(total_index.end(), slot.cbegin(), slot.cend());
+
+        // rebalance the indices
+        int peer_dim = peer_dims_.at(peer_kernel).size();
+        auto peer_dim_it = std::next(total_index.cbegin(), peer_dim);
+        std::vector<int> peer_index(total_index.cbegin(), peer_dim_it);
+        std::vector<int> peer_slot(peer_dim_it, total_index.cend());
+        endpoints.emplace_back(peer_kernel, peer_index, peer_port, peer_slot);
+    }
+
+    return endpoints;
 }
 
 } }
diff --git a/libmuscle/cpp/src/libmuscle/peer_manager.hpp b/libmuscle/cpp/src/libmuscle/peer_manager.hpp
index 715bb1aa..c6ac5ff7 100644
--- a/libmuscle/cpp/src/libmuscle/peer_manager.hpp
+++ b/libmuscle/cpp/src/libmuscle/peer_manager.hpp
@@ -53,12 +53,13 @@ class PeerManager {
          */
         bool is_connected(ymmsl::Identifier const & port) const;
 
-        /** Get a reference for the peer port.
+        /** Get a reference for all the peer ports.
          *
          * @param port Name of the port on this side.
-         * @return Name of the port on the peer.
+         * @return Names of the port on the peers.
          */
-        ymmsl::Reference get_peer_port(ymmsl::Identifier const & port) const;
+        std::vector<ymmsl::Reference> const & get_peer_ports(
+                ymmsl::Identifier const & port) const;
 
         /** Get the dimensions of a peer kernel.
          *
@@ -76,20 +77,20 @@ class PeerManager {
         std::vector<std::string> get_peer_locations(
                 ymmsl::Reference const & peer_instance) const;
 
-        /** Determine the peer endpoint for the given port and slot.
+        /** Determine the peer endpoints for the given port and slot.
          *
          * @param port The port on our side to send or receive on.
          * @param slot The slot to send or receive on.
-         * @return The peer endpoint.
+         * @return The peer endpoints.
          */
-        Endpoint get_peer_endpoint(
+        std::vector<Endpoint> const get_peer_endpoints(
                 ymmsl::Identifier const & port,
                 std::vector<int> const & slot) const;
 
     private:
         ymmsl::Reference kernel_;
         std::vector<int> index_;
-        std::unordered_map<ymmsl::Reference, ymmsl::Reference> peers_;
+        std::unordered_map<ymmsl::Reference, std::vector<ymmsl::Reference>> peers_;
         PeerDims peer_dims_;
         PeerLocations peer_locations_;
 };
diff --git a/libmuscle/cpp/src/libmuscle/tests/mocks/mock_peer_manager.cpp b/libmuscle/cpp/src/libmuscle/tests/mocks/mock_peer_manager.cpp
index 85893e56..850d8501 100644
--- a/libmuscle/cpp/src/libmuscle/tests/mocks/mock_peer_manager.cpp
+++ b/libmuscle/cpp/src/libmuscle/tests/mocks/mock_peer_manager.cpp
@@ -28,7 +28,7 @@ bool MockPeerManager::is_connected(Identifier const & port) const {
     return is_connected_return_value;
 }
 
-Reference MockPeerManager::get_peer_port(Identifier const & port) const {
+std::vector<Reference> MockPeerManager::get_peer_ports(Identifier const & port) const {
     return get_peer_port_table.at(port);
 }
 
@@ -42,7 +42,7 @@ std::vector<std::string> MockPeerManager::get_peer_locations(
     return std::vector<std::string>({std::string("tcp:test")});
 }
 
-Endpoint MockPeerManager::get_peer_endpoint(
+std::vector<Endpoint> MockPeerManager::get_peer_endpoints(
         Identifier const & port,
         std::vector<int> const & slot
         ) const
@@ -74,9 +74,9 @@ PeerDims MockPeerManager::last_constructed_peer_dims;
 PeerLocations MockPeerManager::last_constructed_peer_locations;
 
 bool MockPeerManager::is_connected_return_value;
-std::unordered_map<Identifier, Reference> MockPeerManager::get_peer_port_table;
+std::unordered_map<Identifier, std::vector<Reference>> MockPeerManager::get_peer_port_table;
 std::unordered_map<Reference, std::vector<int>> MockPeerManager::get_peer_dims_table;
-std::unordered_map<Reference, Endpoint> MockPeerManager::get_peer_endpoint_table;
+std::unordered_map<Reference, std::vector<Endpoint>> MockPeerManager::get_peer_endpoint_table;
 
 } }
 
diff --git a/libmuscle/cpp/src/libmuscle/tests/mocks/mock_peer_manager.hpp b/libmuscle/cpp/src/libmuscle/tests/mocks/mock_peer_manager.hpp
index e5448662..231ce403 100644
--- a/libmuscle/cpp/src/libmuscle/tests/mocks/mock_peer_manager.hpp
+++ b/libmuscle/cpp/src/libmuscle/tests/mocks/mock_peer_manager.hpp
@@ -26,14 +26,14 @@ class MockPeerManager {
 
         bool is_connected(ymmsl::Identifier const & port) const;
 
-        ymmsl::Reference get_peer_port(ymmsl::Identifier const & port) const;
+        std::vector<ymmsl::Reference> get_peer_ports(ymmsl::Identifier const & port) const;
 
         std::vector<int> get_peer_dims(ymmsl::Reference const & peer_kernel) const;
 
         std::vector<std::string> get_peer_locations(
                 ymmsl::Reference const & peer_instance) const;
 
-        Endpoint get_peer_endpoint(
+        std::vector<Endpoint> get_peer_endpoints(
                 ymmsl::Identifier const & port,
                 std::vector<int> const & slot) const;
 
@@ -48,11 +48,11 @@ class MockPeerManager {
         static PeerLocations last_constructed_peer_locations;
 
         static bool is_connected_return_value;
-        static std::unordered_map<ymmsl::Identifier, ymmsl::Reference>
+        static std::unordered_map<ymmsl::Identifier, std::vector<ymmsl::Reference>>
             get_peer_port_table;
         static std::unordered_map<ymmsl::Reference, std::vector<int>>
             get_peer_dims_table;
-        static std::unordered_map<ymmsl::Reference, Endpoint>
+        static std::unordered_map<ymmsl::Reference, std::vector<Endpoint>>
             get_peer_endpoint_table;
 };
 
diff --git a/libmuscle/cpp/src/libmuscle/tests/test_communicator.cpp b/libmuscle/cpp/src/libmuscle/tests/test_communicator.cpp
index 61ec225a..4286f452 100644
--- a/libmuscle/cpp/src/libmuscle/tests/test_communicator.cpp
+++ b/libmuscle/cpp/src/libmuscle/tests/test_communicator.cpp
@@ -109,8 +109,10 @@ std::unique_ptr<Communicator> connected_communicator() {
             {Reference("other"), {"tcp:test"}}});
 
     MockPeerManager::get_peer_dims_table.emplace("other", std::vector<int>({1}));
-    MockPeerManager::get_peer_endpoint_table.emplace("out", Endpoint("other", {}, "in", {13}));
-    MockPeerManager::get_peer_endpoint_table.emplace("in", Endpoint("other", {}, "out", {13}));
+    MockPeerManager::get_peer_endpoint_table.emplace("out",
+            std::vector<Endpoint>({Endpoint("other", {}, "in", {13})}));
+    MockPeerManager::get_peer_endpoint_table.emplace("in",
+            std::vector<Endpoint>({Endpoint("other", {}, "out", {13})}));
 
     comm->connect(conduits, peer_dims, peer_locations);
     return std::move(comm);
@@ -130,8 +132,10 @@ std::unique_ptr<Communicator> connected_communicator2() {
             {Reference("kernel"), {"tcp:test"}}});
 
     MockPeerManager::get_peer_dims_table.emplace("kernel", std::vector<int>({20}));
-    MockPeerManager::get_peer_endpoint_table.emplace("in[13]", Endpoint("kernel", {13}, "out", {}));
-    MockPeerManager::get_peer_endpoint_table.emplace("out[13]", Endpoint("kernel", {13}, "in", {}));
+    MockPeerManager::get_peer_endpoint_table.emplace("in[13]",
+            std::vector<Endpoint>({Endpoint("kernel", {13}, "out", {})}));
+    MockPeerManager::get_peer_endpoint_table.emplace("out[13]",
+            std::vector<Endpoint>({Endpoint("kernel", {13}, "in", {})}));
 
     comm->connect(conduits, peer_dims, peer_locations);
     return std::move(comm);
@@ -156,10 +160,14 @@ std::unique_ptr<Communicator> connected_communicator3() {
             {Reference("other"), {"tcp:test"}}});
 
     MockPeerManager::get_peer_dims_table.emplace("other", std::vector<int>({}));
-    MockPeerManager::get_peer_endpoint_table.emplace("out[13]", Endpoint("other", {}, "in", {13}));
-    MockPeerManager::get_peer_endpoint_table.emplace("in[13]", Endpoint("other", {}, "out", {13}));
-    MockPeerManager::get_peer_port_table.emplace("out", "other.in");
-    MockPeerManager::get_peer_port_table.emplace("in", "other.out");
+    MockPeerManager::get_peer_endpoint_table.emplace("out[13]",
+            std::vector<Endpoint>({Endpoint("other", {}, "in", {13})}));
+    MockPeerManager::get_peer_endpoint_table.emplace("in[13]",
+            std::vector<Endpoint>({Endpoint("other", {}, "out", {13})}));
+    MockPeerManager::get_peer_port_table.emplace("out",
+            std::vector<Reference>({"other.in"}));
+    MockPeerManager::get_peer_port_table.emplace("in",
+            std::vector<Reference>({"other.out"}));
 
     comm->connect(conduits, peer_dims, peer_locations);
     return std::move(comm);
@@ -239,9 +247,12 @@ TEST(libmuscle_communicator, test_connect_vector_ports) {
             {Reference("other3"), {"tcp:test3"}}
             });
 
-    MockPeerManager::get_peer_port_table.emplace("in", "other1.out");
-    MockPeerManager::get_peer_port_table.emplace("out1", "other.in");
-    MockPeerManager::get_peer_port_table.emplace("out2", "other3.in");
+    MockPeerManager::get_peer_port_table.emplace("in",
+            std::vector<Reference>({"other1.out"}));
+    MockPeerManager::get_peer_port_table.emplace("out1",
+            std::vector<Reference>({"other.in"}));
+    MockPeerManager::get_peer_port_table.emplace("out2",
+            std::vector<Reference>({"other3.in"}));
 
     MockPeerManager::get_peer_dims_table.emplace("other1", std::vector<int>({20, 7}));
     MockPeerManager::get_peer_dims_table.emplace("other", std::vector<int>({25}));
@@ -294,7 +305,8 @@ TEST(libmuscle_communicator, test_connect_multidimensional_ports) {
             {Reference("other"), {"tcp:test"}}
             });
 
-    MockPeerManager::get_peer_port_table.emplace("in", "other.out");
+    MockPeerManager::get_peer_port_table.emplace("in",
+            std::vector<Reference>({"other.out"}));
     MockPeerManager::get_peer_dims_table.emplace("other", std::vector<int>({20, 7, 30}));
 
     ASSERT_THROW(
@@ -330,9 +342,12 @@ TEST(libmuscle_communicator, test_connect_inferred_ports) {
             {Reference("other2"), {"tcp:test2"}}
             });
 
-    MockPeerManager::get_peer_port_table.emplace("in", "other1.out");
-    MockPeerManager::get_peer_port_table.emplace("out1", "other.in");
-    MockPeerManager::get_peer_port_table.emplace("out3", "other2.in");
+    MockPeerManager::get_peer_port_table.emplace("in",
+            std::vector<Reference>({"other1.out"}));
+    MockPeerManager::get_peer_port_table.emplace("out1",
+            std::vector<Reference>({"other.in"}));
+    MockPeerManager::get_peer_port_table.emplace("out3",
+            std::vector<Reference>({"other2.in"}));
 
     MockPeerManager::get_peer_dims_table.emplace("other1", std::vector<int>({20, 7}));
     MockPeerManager::get_peer_dims_table.emplace("other", std::vector<int>({25}));
diff --git a/libmuscle/cpp/src/libmuscle/tests/test_peer_manager.cpp b/libmuscle/cpp/src/libmuscle/tests/test_peer_manager.cpp
index 7e0f71db..eee0977a 100644
--- a/libmuscle/cpp/src/libmuscle/tests/test_peer_manager.cpp
+++ b/libmuscle/cpp/src/libmuscle/tests/test_peer_manager.cpp
@@ -82,16 +82,16 @@ TEST(libmuscle_peer_manager, is_connected) {
 
 TEST(libmuscle_peer_manager, get_peer_port) {
     auto pm = peer_manager();
-    ASSERT_EQ(pm.get_peer_port("out"), "other.in");
-    ASSERT_EQ(pm.get_peer_port("in"), "other.out");
+    ASSERT_EQ(pm.get_peer_ports("out"), std::vector<Reference>({"other.in"}));
+    ASSERT_EQ(pm.get_peer_ports("in"), std::vector<Reference>({"other.out"}));
 
     auto pm2 = peer_manager2();
-    ASSERT_EQ(pm2.get_peer_port("out"), "kernel.in");
-    ASSERT_EQ(pm2.get_peer_port("in"), "kernel.out");
+    ASSERT_EQ(pm2.get_peer_ports("out"), std::vector<Reference>({"kernel.in"}));
+    ASSERT_EQ(pm2.get_peer_ports("in"), std::vector<Reference>({"kernel.out"}));
 
     auto pm3 = peer_manager3();
-    ASSERT_EQ(pm3.get_peer_port("out"), "other.in");
-    ASSERT_EQ(pm3.get_peer_port("in"), "other.out");
+    ASSERT_EQ(pm3.get_peer_ports("out"), std::vector<Reference>({"other.in"}));
+    ASSERT_EQ(pm3.get_peer_ports("in"), std::vector<Reference>({"other.out"}));
 }
 
 TEST(libmuscle_peer_manager, get_peer_dims) {
@@ -121,16 +121,16 @@ TEST(libmuscle_peer_manager, get_peer_locations) {
 
 TEST(libmuscle_peer_manager, get_peer_endpoint) {
     auto pm = peer_manager();
-    ASSERT_EQ(std::string(pm.get_peer_endpoint("out", {})), "other.in[13]");
-    ASSERT_EQ(std::string(pm.get_peer_endpoint("in", {})), "other.out[13]");
+    ASSERT_EQ(std::string(pm.get_peer_endpoints("out", {})[0]), "other.in[13]");
+    ASSERT_EQ(std::string(pm.get_peer_endpoints("in", {})[0]), "other.out[13]");
 
     auto pm2 = peer_manager2();
-    ASSERT_EQ(std::string(pm2.get_peer_endpoint("out", {11})), "kernel[11].in");
-    ASSERT_EQ(std::string(pm2.get_peer_endpoint("in", {11})), "kernel[11].out");
+    ASSERT_EQ(std::string(pm2.get_peer_endpoints("out", {11})[0]), "kernel[11].in");
+    ASSERT_EQ(std::string(pm2.get_peer_endpoints("in", {11})[0]), "kernel[11].out");
 
     auto pm3 = peer_manager3();
-    ASSERT_EQ(std::string(pm3.get_peer_endpoint("out", {42})), "other.in[42]");
-    ASSERT_EQ(std::string(pm3.get_peer_endpoint("in", {42})), "other.out[42]");
+    ASSERT_EQ(std::string(pm3.get_peer_endpoints("out", {42})[0]), "other.in[42]");
+    ASSERT_EQ(std::string(pm3.get_peer_endpoints("in", {42})[0]), "other.out[42]");
 
 }
 

From 6876732f024fc8a55fdd432d9e03c4af257dd416 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Fri, 14 Oct 2022 14:23:03 +0200
Subject: [PATCH 055/183] Add integration test for c++ multicast

---
 integration_test/test_multicast_cpp.py | 91 ++++++++++++++++++++++++++
 1 file changed, 91 insertions(+)
 create mode 100644 integration_test/test_multicast_cpp.py

diff --git a/integration_test/test_multicast_cpp.py b/integration_test/test_multicast_cpp.py
new file mode 100644
index 00000000..d97fc0d6
--- /dev/null
+++ b/integration_test/test_multicast_cpp.py
@@ -0,0 +1,91 @@
+from pathlib import Path
+import sys
+
+import ymmsl
+
+from libmuscle import Instance
+from libmuscle.manager.manager import Manager
+from libmuscle.manager.run_dir import RunDir
+
+# when executing this file as a component, .conftest cannot be resolved
+if __name__ == "__main__":
+    def skip_if_python_only(func):
+        return func
+else:
+    from .conftest import skip_if_python_only
+
+
+def receiver():
+    instance = Instance({ymmsl.Operator.F_INIT: ['in']})
+
+    i = 0
+    while instance.reuse_instance():
+        # f_init
+        msg = instance.receive('in')
+        assert msg.data == i
+        assert isinstance(msg.data, int)
+        i += 1
+
+
+@skip_if_python_only
+def test_multicast_cpp(tmpdir):
+    tmppath = Path(str(tmpdir))
+
+    # find our test component and its requirements
+    cpp_build_dir = Path(__file__).parents[1] / 'libmuscle' / 'cpp' / 'build'
+    lib_paths = [cpp_build_dir / 'msgpack' / 'msgpack' / 'lib']
+    ld_lib_path = ':'.join(map(str, lib_paths))
+
+    cpp_test_dir = cpp_build_dir / 'libmuscle' / 'tests'
+    test_component = cpp_test_dir / 'component_test'
+
+    # make config
+    ymmsl_text = f"""
+ymmsl_version: v0.1
+model:
+  name: test_model
+  components:
+    multicast:
+      implementation: component
+    receiver1:
+      implementation: receiver
+    receiver2:
+      implementation: receiver
+  conduits:
+    multicast.out:
+    - receiver1.in
+    - receiver2.in
+implementations:
+  component:
+    env:
+      LD_LIBRARY_PATH: {ld_lib_path}
+    executable: {test_component}
+  receiver:
+    executable: {sys.executable}
+    args:
+    - {__file__}
+resources:
+  multicast:
+    threads: 1
+  receiver1:
+    threads: 1
+  receiver2:
+    threads: 1"""
+
+    config = ymmsl.load(ymmsl_text)
+    config.as_configuration().check_consistent()
+
+    # set up
+    run_dir = RunDir(tmppath / 'run')
+
+    # launch MUSCLE Manager with simulation
+    manager = Manager(config, run_dir)
+    manager.start_instances()
+    success = manager.wait()
+
+    # check that all did not go well
+    assert success
+
+
+if __name__ == "__main__":
+    receiver()

From 4f979999aef870452104a74a2d04fc1b7e0b721f Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Tue, 18 Oct 2022 11:51:03 +0200
Subject: [PATCH 056/183] Ensure correct timestamp type in Message

Fixes #118

When an incorrect type is provided by the user for Message.timestamp or
Message.next_timestamp, MsgPack will serialize an invalid MMPMessage on
the wire. This leads to errors in statically typed peer actors.

Issue is fixed by explicitly converting to float in Message.__init__ and
checking again in MMPMessage.__init__ (as the user may have assigned
another value between creation of the Message and Instance.send).
---
 libmuscle/python/libmuscle/communicator.py | 5 +++++
 libmuscle/python/libmuscle/mpp_message.py  | 5 +++++
 2 files changed, 10 insertions(+)

diff --git a/libmuscle/python/libmuscle/communicator.py b/libmuscle/python/libmuscle/communicator.py
index efdcb30d..fec6e736 100644
--- a/libmuscle/python/libmuscle/communicator.py
+++ b/libmuscle/python/libmuscle/communicator.py
@@ -49,6 +49,11 @@ def __init__(self, timestamp: float, next_timestamp: Optional[float],
             data: An object to send or that was received.
             settings: Overlay settings to send or that were received.
         """
+        # make sure timestamp and next_timestamp are floats
+        timestamp = float(timestamp)
+        if next_timestamp is not None:
+            next_timestamp = float(next_timestamp)
+
         self.timestamp = timestamp
         self.next_timestamp = next_timestamp
         self.data = data
diff --git a/libmuscle/python/libmuscle/mpp_message.py b/libmuscle/python/libmuscle/mpp_message.py
index 15ff09f9..8b84aeab 100644
--- a/libmuscle/python/libmuscle/mpp_message.py
+++ b/libmuscle/python/libmuscle/mpp_message.py
@@ -171,6 +171,11 @@ def __init__(self, sender: Reference, receiver: Reference,
             settings_overlay: The serialised overlay settings.
             data: The serialised contents of the message.
         """
+        # make sure timestamp and next_timestamp are floats
+        timestamp = float(timestamp)
+        if next_timestamp is not None:
+            next_timestamp = float(next_timestamp)
+
         self.sender = sender
         self.receiver = receiver
         self.port_length = port_length

From 29e9132fd738635d49271fd1188b25ac992dcf85 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Tue, 18 Oct 2022 14:34:43 +0200
Subject: [PATCH 057/183] Add documentation on coupling with multicast

---
 docs/source/coupling.rst | 52 ++++++++++++++++++++++++++++++++++++++++
 docs/source/index.rst    |  1 +
 2 files changed, 53 insertions(+)
 create mode 100644 docs/source/coupling.rst

diff --git a/docs/source/coupling.rst b/docs/source/coupling.rst
new file mode 100644
index 00000000..8c764a97
--- /dev/null
+++ b/docs/source/coupling.rst
@@ -0,0 +1,52 @@
+Coupling your model
+===================
+
+Multicast
+---------
+
+With MUSCLE3 you can connect an output port to multiple input ports. This is
+called multicast. When a submodel sends a message on a port that is connected to
+multiple input ports, the message is copied and sent to each connected port.
+
+.. note::
+
+    It is not allowed to connect multiple output ports to a single input port.
+
+Example
+```````
+
+.. tabs::
+
+    .. code-tab:: yaml Basic macro/micro model configuration
+
+        ymmsl_version: v0.1
+        model:
+          name: multicast
+          components:
+            macro: macro
+            micro: micro
+          conduits:
+            macro.state_out: micro.state_in
+            micro.state_out: macro.state_in
+
+    .. code-tab:: yaml Extended configuration with multicast
+
+        ymmsl_version: v0.1
+        model:
+          name: multicast
+          components:
+            macro: macro
+            micro: micro
+            printer: printer
+          conduits:
+            macro.state_out: micro.state_in
+            micro.state_out:
+            - macro.state_in
+            - printer.in
+
+In the second tab, a new component `printer` is added and wired to the
+``state_out`` port of the micro model. Whenever the micro model sends a message
+on that port, one copy is sent to the macro model to continue the simulation.
+Another copy is sent to the printer component, which (for example) prints a
+summary of the state.
+
diff --git a/docs/source/index.rst b/docs/source/index.rst
index f9b2096c..ed55ba48 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -33,6 +33,7 @@ Cham.  `<https://doi.org/10.1007/978-3-030-50433-5_33>`_
    installing
    tutorial
    distributed_execution
+   coupling
    cplusplus
    fortran
    mpi

From 3de906f5fe4cab6192f5f9e111267d4b070bf6bb Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Mon, 24 Oct 2022 17:24:38 +0200
Subject: [PATCH 058/183] Fix make errors when CXX=clang++

---
 libmuscle/cpp/src/ymmsl/identity.cpp | 2 +-
 libmuscle/cpp/src/ymmsl/identity.hpp | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/libmuscle/cpp/src/ymmsl/identity.cpp b/libmuscle/cpp/src/ymmsl/identity.cpp
index d9816b7f..e9bf7266 100644
--- a/libmuscle/cpp/src/ymmsl/identity.cpp
+++ b/libmuscle/cpp/src/ymmsl/identity.cpp
@@ -10,7 +10,7 @@
 using namespace std::string_literals;
 
 
-::std::size_t ::std::hash<::ymmsl::impl::Identifier>::operator()(
+::std::size_t (::std::hash<::ymmsl::impl::Identifier>::operator())(
         argument_type const & id) const noexcept
 {
     return hash<std::string>()(id.data_);
diff --git a/libmuscle/cpp/src/ymmsl/identity.hpp b/libmuscle/cpp/src/ymmsl/identity.hpp
index a6197d51..f06af00e 100644
--- a/libmuscle/cpp/src/ymmsl/identity.hpp
+++ b/libmuscle/cpp/src/ymmsl/identity.hpp
@@ -109,8 +109,8 @@ class Identifier {
         friend bool operator==(std::string const & lhs, Identifier const & rhs);
         friend bool operator!=(std::string const & lhs, Identifier const & rhs);
         friend std::ostream & operator<<(std::ostream & os, Identifier const & i);
-        friend ::std::size_t ::std::hash<::ymmsl::impl::Identifier>::operator()(
-                ::ymmsl::impl::Identifier const & id) const;
+        friend ::std::size_t (::std::hash<::ymmsl::impl::Identifier>::operator())(
+                ::ymmsl::impl::Identifier const & id) const noexcept;
         std::string data_;
 };
 

From 2003c1df9cdf87da75f68e4362adbbc69f28bc5d Mon Sep 17 00:00:00 2001
From: Lourens Veen <l.veen@esciencecenter.nl>
Date: Sun, 30 Oct 2022 08:57:27 +0100
Subject: [PATCH 059/183] Update requirements.txt for documentation build

---
 docs/requirements.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/requirements.txt b/docs/requirements.txt
index 2508a213..98b8214f 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -1,7 +1,9 @@
 breathe
+click
 msgpack==0.6.1
 netifaces
 numpy>=1.12
+qcg-pilotjob
 six
 sphinx-fortran
 sphinx-tabs

From 39ad298c675126d9463c0d13495822e61a3dd55c Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Wed, 2 Nov 2022 13:55:01 +0100
Subject: [PATCH 060/183] Fix issue with dependency compilation on clang

---
 libmuscle/cpp/build/libmuscle/tests/Makefile | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/libmuscle/cpp/build/libmuscle/tests/Makefile b/libmuscle/cpp/build/libmuscle/tests/Makefile
index 0a22272b..327b4ddb 100644
--- a/libmuscle/cpp/build/libmuscle/tests/Makefile
+++ b/libmuscle/cpp/build/libmuscle/tests/Makefile
@@ -61,6 +61,9 @@ endif
 %.d: %.cpp
 	$(CXX) $(CPPFLAGS) $(CXXFLAGS) -E -MM -MT $(@:.d=.o) $< -o $@
 
+mpi%.d: mpi%.cpp
+	$(MPICXX) $(CPPFLAGS) $(CXXFLAGS) -E -MM -MT $(@:.d=.o) $< -o $@
+
 %.o: %.cpp
 	$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(DEBUGFLAGS) -c $< -o $@
 

From 78d9204b2db87b76dbf99760abfe35b5422373dc Mon Sep 17 00:00:00 2001
From: Lourens Veen <l.veen@esciencecenter.nl>
Date: Wed, 2 Nov 2022 14:14:33 +0100
Subject: [PATCH 061/183] Update release docs to check RTD docs render before
 releasing.

---
 docs/source/releasing.rst | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/docs/source/releasing.rst b/docs/source/releasing.rst
index eff40de4..e93feb0f 100644
--- a/docs/source/releasing.rst
+++ b/docs/source/releasing.rst
@@ -8,6 +8,23 @@ branching model. Making a release involves quite a few steps, so they're listed
 here to help make the process more reliable; this information is really only
 useful for the maintainers.
 
+Check online documentation
+--------------------------
+
+Online documentation rendering on ReadTheDoc works a bit differently than local
+builds, as a result of which checking a local documentation build only partially
+ensures we get working online documentation. So this needs to be checked:
+
+- Check develop branch documentation is there
+- Specifically, check the Python API documentation page
+- Check the other languages too
+
+If the Python API docs are missing, then it's likely to be a dependency problem.
+Sphinx needs dependencies installed, and that's done differently by tox (which
+uses `setup.py`) and RTD (which uses `docs/requirements.txt`). If the latter is
+outdated, the Python API docs don't render because Sphinx fails to import the
+packages.
+
 Check metadata
 --------------
 

From 848ef7b6ef49ef593c6643695390979364fe363e Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Tue, 9 Aug 2022 13:30:41 +0200
Subject: [PATCH 062/183] Add `tox` as a [dev] dependency

---
 setup.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index ade2fa38..b99e2d06 100644
--- a/setup.py
+++ b/setup.py
@@ -61,7 +61,8 @@
         'dev': [
             'sphinx<3.2',
             'sphinx_rtd_theme',
-            'sphinx-fortran'
+            'sphinx-fortran',
+            'tox'
         ]
     },
 )

From 19fe89959cdf0b3591cfbde26c0c6c97768a3932 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Wed, 2 Nov 2022 15:34:21 +0100
Subject: [PATCH 063/183] Fix 'non-void function does not return' warnings

All fortran_c wrapper functions now return a default value when catching
an error from the C++ API. Fixes the compiler warnings: 'non-void
function does not return a value in all control paths [-Wreturn-type]'
---
 .../bindings/libmuscle_fortran_c.cpp          | 44 ++++++++++++++
 .../bindings/libmuscle_mpi_fortran_c.cpp      | 44 ++++++++++++++
 .../src/ymmsl/bindings/ymmsl_fortran_c.cpp    | 11 ++++
 scripts/api_generator.py                      | 58 +++++++++++++++++++
 scripts/make_libmuscle_api.py                 |  6 ++
 5 files changed, 163 insertions(+)

diff --git a/libmuscle/cpp/src/libmuscle/bindings/libmuscle_fortran_c.cpp b/libmuscle/cpp/src/libmuscle/bindings/libmuscle_fortran_c.cpp
index 1ed5ea5d..418a9c89 100644
--- a/libmuscle/cpp/src/libmuscle/bindings/libmuscle_fortran_c.cpp
+++ b/libmuscle/cpp/src/libmuscle/bindings/libmuscle_fortran_c.cpp
@@ -472,6 +472,7 @@ bool LIBMUSCLE_DataConstRef_as_logical_(std::intptr_t self, int * err_code, char
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return false;
 }
 
 void LIBMUSCLE_DataConstRef_as_character_(std::intptr_t self, char ** ret_val, std::size_t * ret_val_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -563,6 +564,7 @@ int LIBMUSCLE_DataConstRef_as_int_(std::intptr_t self, int * err_code, char ** e
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 char LIBMUSCLE_DataConstRef_as_int1_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -607,6 +609,7 @@ char LIBMUSCLE_DataConstRef_as_int1_(std::intptr_t self, int * err_code, char **
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 short int LIBMUSCLE_DataConstRef_as_int2_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -651,6 +654,7 @@ short int LIBMUSCLE_DataConstRef_as_int2_(std::intptr_t self, int * err_code, ch
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 int32_t LIBMUSCLE_DataConstRef_as_int4_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -695,6 +699,7 @@ int32_t LIBMUSCLE_DataConstRef_as_int4_(std::intptr_t self, int * err_code, char
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 int64_t LIBMUSCLE_DataConstRef_as_int8_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -739,6 +744,7 @@ int64_t LIBMUSCLE_DataConstRef_as_int8_(std::intptr_t self, int * err_code, char
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 float LIBMUSCLE_DataConstRef_as_real4_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -783,6 +789,7 @@ float LIBMUSCLE_DataConstRef_as_real4_(std::intptr_t self, int * err_code, char
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0.0;
 }
 
 double LIBMUSCLE_DataConstRef_as_real8_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -827,6 +834,7 @@ double LIBMUSCLE_DataConstRef_as_real8_(std::intptr_t self, int * err_code, char
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0.0;
 }
 
 std::intptr_t LIBMUSCLE_DataConstRef_as_settings_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -871,6 +879,7 @@ std::intptr_t LIBMUSCLE_DataConstRef_as_settings_(std::intptr_t self, int * err_
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 void LIBMUSCLE_DataConstRef_as_byte_array_(
@@ -918,6 +927,7 @@ std::intptr_t LIBMUSCLE_DataConstRef_get_item_by_key_(
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 std::intptr_t LIBMUSCLE_DataConstRef_get_item_by_index_(
@@ -945,6 +955,7 @@ std::intptr_t LIBMUSCLE_DataConstRef_get_item_by_index_(
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 std::size_t LIBMUSCLE_DataConstRef_num_dims_(
@@ -963,6 +974,7 @@ std::size_t LIBMUSCLE_DataConstRef_num_dims_(
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 void LIBMUSCLE_DataConstRef_shape_(std::intptr_t self, std::size_t ** shp, std::size_t * shp_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -1209,6 +1221,7 @@ bool LIBMUSCLE_DataConstRef_has_indexes_(std::intptr_t self, int * err_code, cha
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return false;
 }
 
 void LIBMUSCLE_DataConstRef_index_(std::intptr_t self, std::size_t i, char ** ret_val, std::size_t * ret_val_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -1709,6 +1722,7 @@ bool LIBMUSCLE_Data_as_logical_(std::intptr_t self, int * err_code, char ** err_
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return false;
 }
 
 void LIBMUSCLE_Data_as_character_(std::intptr_t self, char ** ret_val, std::size_t * ret_val_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -1800,6 +1814,7 @@ int LIBMUSCLE_Data_as_int_(std::intptr_t self, int * err_code, char ** err_msg,
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 char LIBMUSCLE_Data_as_int1_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -1844,6 +1859,7 @@ char LIBMUSCLE_Data_as_int1_(std::intptr_t self, int * err_code, char ** err_msg
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 short int LIBMUSCLE_Data_as_int2_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -1888,6 +1904,7 @@ short int LIBMUSCLE_Data_as_int2_(std::intptr_t self, int * err_code, char ** er
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 int32_t LIBMUSCLE_Data_as_int4_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -1932,6 +1949,7 @@ int32_t LIBMUSCLE_Data_as_int4_(std::intptr_t self, int * err_code, char ** err_
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 int64_t LIBMUSCLE_Data_as_int8_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -1976,6 +1994,7 @@ int64_t LIBMUSCLE_Data_as_int8_(std::intptr_t self, int * err_code, char ** err_
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 float LIBMUSCLE_Data_as_real4_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -2020,6 +2039,7 @@ float LIBMUSCLE_Data_as_real4_(std::intptr_t self, int * err_code, char ** err_m
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0.0;
 }
 
 double LIBMUSCLE_Data_as_real8_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -2064,6 +2084,7 @@ double LIBMUSCLE_Data_as_real8_(std::intptr_t self, int * err_code, char ** err_
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0.0;
 }
 
 std::intptr_t LIBMUSCLE_Data_as_settings_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -2108,6 +2129,7 @@ std::intptr_t LIBMUSCLE_Data_as_settings_(std::intptr_t self, int * err_code, ch
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 void LIBMUSCLE_Data_as_byte_array_(
@@ -2155,6 +2177,7 @@ std::intptr_t LIBMUSCLE_Data_get_item_by_key_(
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 std::intptr_t LIBMUSCLE_Data_get_item_by_index_(
@@ -2182,6 +2205,7 @@ std::intptr_t LIBMUSCLE_Data_get_item_by_index_(
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 std::size_t LIBMUSCLE_Data_num_dims_(
@@ -2200,6 +2224,7 @@ std::size_t LIBMUSCLE_Data_num_dims_(
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 void LIBMUSCLE_Data_shape_(std::intptr_t self, std::size_t ** shp, std::size_t * shp_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -2446,6 +2471,7 @@ bool LIBMUSCLE_Data_has_indexes_(std::intptr_t self, int * err_code, char ** err
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return false;
 }
 
 void LIBMUSCLE_Data_index_(std::intptr_t self, std::size_t i, char ** ret_val, std::size_t * ret_val_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -3444,6 +3470,7 @@ std::intptr_t LIBMUSCLE_Data_value_(
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 std::intptr_t LIBMUSCLE_PortsDescription_create_() {
@@ -3721,6 +3748,7 @@ bool LIBMUSCLE_Instance_is_setting_a_character_(std::intptr_t self, char * name,
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return false;
 }
 
 bool LIBMUSCLE_Instance_is_setting_a_int8_(std::intptr_t self, char * name, std::size_t name_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -3766,6 +3794,7 @@ bool LIBMUSCLE_Instance_is_setting_a_int8_(std::intptr_t self, char * name, std:
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return false;
 }
 
 bool LIBMUSCLE_Instance_is_setting_a_real8_(std::intptr_t self, char * name, std::size_t name_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -3811,6 +3840,7 @@ bool LIBMUSCLE_Instance_is_setting_a_real8_(std::intptr_t self, char * name, std
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return false;
 }
 
 bool LIBMUSCLE_Instance_is_setting_a_logical_(std::intptr_t self, char * name, std::size_t name_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -3856,6 +3886,7 @@ bool LIBMUSCLE_Instance_is_setting_a_logical_(std::intptr_t self, char * name, s
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return false;
 }
 
 bool LIBMUSCLE_Instance_is_setting_a_real8array_(std::intptr_t self, char * name, std::size_t name_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -3901,6 +3932,7 @@ bool LIBMUSCLE_Instance_is_setting_a_real8array_(std::intptr_t self, char * name
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return false;
 }
 
 bool LIBMUSCLE_Instance_is_setting_a_real8array2_(std::intptr_t self, char * name, std::size_t name_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -3946,6 +3978,7 @@ bool LIBMUSCLE_Instance_is_setting_a_real8array2_(std::intptr_t self, char * nam
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return false;
 }
 
 void LIBMUSCLE_Instance_get_setting_as_character_(std::intptr_t self, char * name, std::size_t name_size, char ** ret_val, std::size_t * ret_val_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -4039,6 +4072,7 @@ int64_t LIBMUSCLE_Instance_get_setting_as_int8_(std::intptr_t self, char * name,
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 double LIBMUSCLE_Instance_get_setting_as_real8_(std::intptr_t self, char * name, std::size_t name_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -4084,6 +4118,7 @@ double LIBMUSCLE_Instance_get_setting_as_real8_(std::intptr_t self, char * name,
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0.0;
 }
 
 bool LIBMUSCLE_Instance_get_setting_as_logical_(std::intptr_t self, char * name, std::size_t name_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -4129,6 +4164,7 @@ bool LIBMUSCLE_Instance_get_setting_as_logical_(std::intptr_t self, char * name,
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return false;
 }
 
 void LIBMUSCLE_Instance_get_setting_as_real8array_(std::intptr_t self, char * name, std::size_t name_size, double ** value, std::size_t * value_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -4337,6 +4373,7 @@ std::intptr_t LIBMUSCLE_Instance_receive_p_(std::intptr_t self, char * port_name
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 std::intptr_t LIBMUSCLE_Instance_receive_pd_(std::intptr_t self, char * port_name, std::size_t port_name_size, std::intptr_t default_msg, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -4383,6 +4420,7 @@ std::intptr_t LIBMUSCLE_Instance_receive_pd_(std::intptr_t self, char * port_nam
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 std::intptr_t LIBMUSCLE_Instance_receive_ps_(std::intptr_t self, char * port_name, std::size_t port_name_size, int slot, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -4428,6 +4466,7 @@ std::intptr_t LIBMUSCLE_Instance_receive_ps_(std::intptr_t self, char * port_nam
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 std::intptr_t LIBMUSCLE_Instance_receive_psd_(std::intptr_t self, char * port_name, std::size_t port_name_size, int slot, std::intptr_t default_message, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -4474,6 +4513,7 @@ std::intptr_t LIBMUSCLE_Instance_receive_psd_(std::intptr_t self, char * port_na
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 std::intptr_t LIBMUSCLE_Instance_receive_with_settings_p_(std::intptr_t self, char * port_name, std::size_t port_name_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -4519,6 +4559,7 @@ std::intptr_t LIBMUSCLE_Instance_receive_with_settings_p_(std::intptr_t self, ch
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 std::intptr_t LIBMUSCLE_Instance_receive_with_settings_pd_(std::intptr_t self, char * port_name, std::size_t port_name_size, std::intptr_t default_msg, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -4565,6 +4606,7 @@ std::intptr_t LIBMUSCLE_Instance_receive_with_settings_pd_(std::intptr_t self, c
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 std::intptr_t LIBMUSCLE_Instance_receive_with_settings_ps_(std::intptr_t self, char * port_name, std::size_t port_name_size, int slot, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -4610,6 +4652,7 @@ std::intptr_t LIBMUSCLE_Instance_receive_with_settings_ps_(std::intptr_t self, c
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 std::intptr_t LIBMUSCLE_Instance_receive_with_settings_psd_(std::intptr_t self, char * port_name, std::size_t port_name_size, int slot, std::intptr_t default_msg, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -4656,6 +4699,7 @@ std::intptr_t LIBMUSCLE_Instance_receive_with_settings_psd_(std::intptr_t self,
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 std::intptr_t LIBMUSCLE_IMPL_BINDINGS_CmdLineArgs_create_(int count) {
diff --git a/libmuscle/cpp/src/libmuscle/bindings/libmuscle_mpi_fortran_c.cpp b/libmuscle/cpp/src/libmuscle/bindings/libmuscle_mpi_fortran_c.cpp
index c66b971f..877bb2a6 100644
--- a/libmuscle/cpp/src/libmuscle/bindings/libmuscle_mpi_fortran_c.cpp
+++ b/libmuscle/cpp/src/libmuscle/bindings/libmuscle_mpi_fortran_c.cpp
@@ -472,6 +472,7 @@ bool LIBMUSCLE_DataConstRef_as_logical_(std::intptr_t self, int * err_code, char
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return false;
 }
 
 void LIBMUSCLE_DataConstRef_as_character_(std::intptr_t self, char ** ret_val, std::size_t * ret_val_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -563,6 +564,7 @@ int LIBMUSCLE_DataConstRef_as_int_(std::intptr_t self, int * err_code, char ** e
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 char LIBMUSCLE_DataConstRef_as_int1_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -607,6 +609,7 @@ char LIBMUSCLE_DataConstRef_as_int1_(std::intptr_t self, int * err_code, char **
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 short int LIBMUSCLE_DataConstRef_as_int2_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -651,6 +654,7 @@ short int LIBMUSCLE_DataConstRef_as_int2_(std::intptr_t self, int * err_code, ch
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 int32_t LIBMUSCLE_DataConstRef_as_int4_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -695,6 +699,7 @@ int32_t LIBMUSCLE_DataConstRef_as_int4_(std::intptr_t self, int * err_code, char
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 int64_t LIBMUSCLE_DataConstRef_as_int8_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -739,6 +744,7 @@ int64_t LIBMUSCLE_DataConstRef_as_int8_(std::intptr_t self, int * err_code, char
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 float LIBMUSCLE_DataConstRef_as_real4_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -783,6 +789,7 @@ float LIBMUSCLE_DataConstRef_as_real4_(std::intptr_t self, int * err_code, char
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0.0;
 }
 
 double LIBMUSCLE_DataConstRef_as_real8_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -827,6 +834,7 @@ double LIBMUSCLE_DataConstRef_as_real8_(std::intptr_t self, int * err_code, char
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0.0;
 }
 
 std::intptr_t LIBMUSCLE_DataConstRef_as_settings_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -871,6 +879,7 @@ std::intptr_t LIBMUSCLE_DataConstRef_as_settings_(std::intptr_t self, int * err_
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 void LIBMUSCLE_DataConstRef_as_byte_array_(
@@ -918,6 +927,7 @@ std::intptr_t LIBMUSCLE_DataConstRef_get_item_by_key_(
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 std::intptr_t LIBMUSCLE_DataConstRef_get_item_by_index_(
@@ -945,6 +955,7 @@ std::intptr_t LIBMUSCLE_DataConstRef_get_item_by_index_(
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 std::size_t LIBMUSCLE_DataConstRef_num_dims_(
@@ -963,6 +974,7 @@ std::size_t LIBMUSCLE_DataConstRef_num_dims_(
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 void LIBMUSCLE_DataConstRef_shape_(std::intptr_t self, std::size_t ** shp, std::size_t * shp_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -1209,6 +1221,7 @@ bool LIBMUSCLE_DataConstRef_has_indexes_(std::intptr_t self, int * err_code, cha
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return false;
 }
 
 void LIBMUSCLE_DataConstRef_index_(std::intptr_t self, std::size_t i, char ** ret_val, std::size_t * ret_val_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -1709,6 +1722,7 @@ bool LIBMUSCLE_Data_as_logical_(std::intptr_t self, int * err_code, char ** err_
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return false;
 }
 
 void LIBMUSCLE_Data_as_character_(std::intptr_t self, char ** ret_val, std::size_t * ret_val_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -1800,6 +1814,7 @@ int LIBMUSCLE_Data_as_int_(std::intptr_t self, int * err_code, char ** err_msg,
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 char LIBMUSCLE_Data_as_int1_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -1844,6 +1859,7 @@ char LIBMUSCLE_Data_as_int1_(std::intptr_t self, int * err_code, char ** err_msg
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 short int LIBMUSCLE_Data_as_int2_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -1888,6 +1904,7 @@ short int LIBMUSCLE_Data_as_int2_(std::intptr_t self, int * err_code, char ** er
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 int32_t LIBMUSCLE_Data_as_int4_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -1932,6 +1949,7 @@ int32_t LIBMUSCLE_Data_as_int4_(std::intptr_t self, int * err_code, char ** err_
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 int64_t LIBMUSCLE_Data_as_int8_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -1976,6 +1994,7 @@ int64_t LIBMUSCLE_Data_as_int8_(std::intptr_t self, int * err_code, char ** err_
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 float LIBMUSCLE_Data_as_real4_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -2020,6 +2039,7 @@ float LIBMUSCLE_Data_as_real4_(std::intptr_t self, int * err_code, char ** err_m
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0.0;
 }
 
 double LIBMUSCLE_Data_as_real8_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -2064,6 +2084,7 @@ double LIBMUSCLE_Data_as_real8_(std::intptr_t self, int * err_code, char ** err_
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0.0;
 }
 
 std::intptr_t LIBMUSCLE_Data_as_settings_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -2108,6 +2129,7 @@ std::intptr_t LIBMUSCLE_Data_as_settings_(std::intptr_t self, int * err_code, ch
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 void LIBMUSCLE_Data_as_byte_array_(
@@ -2155,6 +2177,7 @@ std::intptr_t LIBMUSCLE_Data_get_item_by_key_(
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 std::intptr_t LIBMUSCLE_Data_get_item_by_index_(
@@ -2182,6 +2205,7 @@ std::intptr_t LIBMUSCLE_Data_get_item_by_index_(
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 std::size_t LIBMUSCLE_Data_num_dims_(
@@ -2200,6 +2224,7 @@ std::size_t LIBMUSCLE_Data_num_dims_(
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 void LIBMUSCLE_Data_shape_(std::intptr_t self, std::size_t ** shp, std::size_t * shp_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -2446,6 +2471,7 @@ bool LIBMUSCLE_Data_has_indexes_(std::intptr_t self, int * err_code, char ** err
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return false;
 }
 
 void LIBMUSCLE_Data_index_(std::intptr_t self, std::size_t i, char ** ret_val, std::size_t * ret_val_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -3444,6 +3470,7 @@ std::intptr_t LIBMUSCLE_Data_value_(
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 std::intptr_t LIBMUSCLE_PortsDescription_create_() {
@@ -3728,6 +3755,7 @@ bool LIBMUSCLE_Instance_is_setting_a_character_(std::intptr_t self, char * name,
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return false;
 }
 
 bool LIBMUSCLE_Instance_is_setting_a_int8_(std::intptr_t self, char * name, std::size_t name_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -3773,6 +3801,7 @@ bool LIBMUSCLE_Instance_is_setting_a_int8_(std::intptr_t self, char * name, std:
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return false;
 }
 
 bool LIBMUSCLE_Instance_is_setting_a_real8_(std::intptr_t self, char * name, std::size_t name_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -3818,6 +3847,7 @@ bool LIBMUSCLE_Instance_is_setting_a_real8_(std::intptr_t self, char * name, std
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return false;
 }
 
 bool LIBMUSCLE_Instance_is_setting_a_logical_(std::intptr_t self, char * name, std::size_t name_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -3863,6 +3893,7 @@ bool LIBMUSCLE_Instance_is_setting_a_logical_(std::intptr_t self, char * name, s
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return false;
 }
 
 bool LIBMUSCLE_Instance_is_setting_a_real8array_(std::intptr_t self, char * name, std::size_t name_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -3908,6 +3939,7 @@ bool LIBMUSCLE_Instance_is_setting_a_real8array_(std::intptr_t self, char * name
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return false;
 }
 
 bool LIBMUSCLE_Instance_is_setting_a_real8array2_(std::intptr_t self, char * name, std::size_t name_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -3953,6 +3985,7 @@ bool LIBMUSCLE_Instance_is_setting_a_real8array2_(std::intptr_t self, char * nam
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return false;
 }
 
 void LIBMUSCLE_Instance_get_setting_as_character_(std::intptr_t self, char * name, std::size_t name_size, char ** ret_val, std::size_t * ret_val_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -4046,6 +4079,7 @@ int64_t LIBMUSCLE_Instance_get_setting_as_int8_(std::intptr_t self, char * name,
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 double LIBMUSCLE_Instance_get_setting_as_real8_(std::intptr_t self, char * name, std::size_t name_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -4091,6 +4125,7 @@ double LIBMUSCLE_Instance_get_setting_as_real8_(std::intptr_t self, char * name,
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0.0;
 }
 
 bool LIBMUSCLE_Instance_get_setting_as_logical_(std::intptr_t self, char * name, std::size_t name_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -4136,6 +4171,7 @@ bool LIBMUSCLE_Instance_get_setting_as_logical_(std::intptr_t self, char * name,
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return false;
 }
 
 void LIBMUSCLE_Instance_get_setting_as_real8array_(std::intptr_t self, char * name, std::size_t name_size, double ** value, std::size_t * value_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -4344,6 +4380,7 @@ std::intptr_t LIBMUSCLE_Instance_receive_p_(std::intptr_t self, char * port_name
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 std::intptr_t LIBMUSCLE_Instance_receive_pd_(std::intptr_t self, char * port_name, std::size_t port_name_size, std::intptr_t default_msg, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -4390,6 +4427,7 @@ std::intptr_t LIBMUSCLE_Instance_receive_pd_(std::intptr_t self, char * port_nam
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 std::intptr_t LIBMUSCLE_Instance_receive_ps_(std::intptr_t self, char * port_name, std::size_t port_name_size, int slot, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -4435,6 +4473,7 @@ std::intptr_t LIBMUSCLE_Instance_receive_ps_(std::intptr_t self, char * port_nam
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 std::intptr_t LIBMUSCLE_Instance_receive_psd_(std::intptr_t self, char * port_name, std::size_t port_name_size, int slot, std::intptr_t default_message, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -4481,6 +4520,7 @@ std::intptr_t LIBMUSCLE_Instance_receive_psd_(std::intptr_t self, char * port_na
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 std::intptr_t LIBMUSCLE_Instance_receive_with_settings_p_(std::intptr_t self, char * port_name, std::size_t port_name_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -4526,6 +4566,7 @@ std::intptr_t LIBMUSCLE_Instance_receive_with_settings_p_(std::intptr_t self, ch
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 std::intptr_t LIBMUSCLE_Instance_receive_with_settings_pd_(std::intptr_t self, char * port_name, std::size_t port_name_size, std::intptr_t default_msg, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -4572,6 +4613,7 @@ std::intptr_t LIBMUSCLE_Instance_receive_with_settings_pd_(std::intptr_t self, c
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 std::intptr_t LIBMUSCLE_Instance_receive_with_settings_ps_(std::intptr_t self, char * port_name, std::size_t port_name_size, int slot, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -4617,6 +4659,7 @@ std::intptr_t LIBMUSCLE_Instance_receive_with_settings_ps_(std::intptr_t self, c
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 std::intptr_t LIBMUSCLE_Instance_receive_with_settings_psd_(std::intptr_t self, char * port_name, std::size_t port_name_size, int slot, std::intptr_t default_msg, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -4663,6 +4706,7 @@ std::intptr_t LIBMUSCLE_Instance_receive_with_settings_psd_(std::intptr_t self,
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 std::intptr_t LIBMUSCLE_IMPL_BINDINGS_CmdLineArgs_create_(int count) {
diff --git a/libmuscle/cpp/src/ymmsl/bindings/ymmsl_fortran_c.cpp b/libmuscle/cpp/src/ymmsl/bindings/ymmsl_fortran_c.cpp
index f642e515..6c7e7ff5 100644
--- a/libmuscle/cpp/src/ymmsl/bindings/ymmsl_fortran_c.cpp
+++ b/libmuscle/cpp/src/ymmsl/bindings/ymmsl_fortran_c.cpp
@@ -87,6 +87,7 @@ bool YMMSL_Settings_is_a_character_(std::intptr_t self, char * key, std::size_t
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return false;
 }
 
 bool YMMSL_Settings_is_a_int4_(std::intptr_t self, char * key, std::size_t key_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -132,6 +133,7 @@ bool YMMSL_Settings_is_a_int4_(std::intptr_t self, char * key, std::size_t key_s
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return false;
 }
 
 bool YMMSL_Settings_is_a_int8_(std::intptr_t self, char * key, std::size_t key_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -177,6 +179,7 @@ bool YMMSL_Settings_is_a_int8_(std::intptr_t self, char * key, std::size_t key_s
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return false;
 }
 
 bool YMMSL_Settings_is_a_real8_(std::intptr_t self, char * key, std::size_t key_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -222,6 +225,7 @@ bool YMMSL_Settings_is_a_real8_(std::intptr_t self, char * key, std::size_t key_
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return false;
 }
 
 bool YMMSL_Settings_is_a_logical_(std::intptr_t self, char * key, std::size_t key_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -267,6 +271,7 @@ bool YMMSL_Settings_is_a_logical_(std::intptr_t self, char * key, std::size_t ke
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return false;
 }
 
 bool YMMSL_Settings_is_a_real8array_(std::intptr_t self, char * key, std::size_t key_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -312,6 +317,7 @@ bool YMMSL_Settings_is_a_real8array_(std::intptr_t self, char * key, std::size_t
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return false;
 }
 
 bool YMMSL_Settings_is_a_real8array2_(std::intptr_t self, char * key, std::size_t key_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -357,6 +363,7 @@ bool YMMSL_Settings_is_a_real8array2_(std::intptr_t self, char * key, std::size_
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return false;
 }
 
 void YMMSL_Settings_set_character_(std::intptr_t self, char * key, std::size_t key_size, char * value, std::size_t value_size) {
@@ -506,6 +513,7 @@ int32_t YMMSL_Settings_get_as_int4_(std::intptr_t self, char * key, std::size_t
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 int64_t YMMSL_Settings_get_as_int8_(std::intptr_t self, char * key, std::size_t key_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -551,6 +559,7 @@ int64_t YMMSL_Settings_get_as_int8_(std::intptr_t self, char * key, std::size_t
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0;
 }
 
 double YMMSL_Settings_get_as_real8_(std::intptr_t self, char * key, std::size_t key_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -596,6 +605,7 @@ double YMMSL_Settings_get_as_real8_(std::intptr_t self, char * key, std::size_t
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return 0.0;
 }
 
 bool YMMSL_Settings_get_as_logical_(std::intptr_t self, char * key, std::size_t key_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
@@ -641,6 +651,7 @@ bool YMMSL_Settings_get_as_logical_(std::intptr_t self, char * key, std::size_t
         *err_msg = const_cast<char*>(msg.data());
         *err_msg_len = msg.size();
     }
+    return false;
 }
 
 void YMMSL_Settings_get_as_real8array_(std::intptr_t self, char * key, std::size_t key_size, double ** value, std::size_t * value_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) {
diff --git a/scripts/api_generator.py b/scripts/api_generator.py
index 2823c4fc..e6c4f5a6 100644
--- a/scripts/api_generator.py
+++ b/scripts/api_generator.py
@@ -132,6 +132,9 @@ def f_call_c(self, result_name: str, call: str) -> str:
     def f_return_result(self, return_name: str, result_name: str) -> str:
         return ''
 
+    def fc_return_default(self) -> str:
+        return ''  # memfun has void signature
+
 
 class String(Par):
     """Represents a string-typed parameter.
@@ -197,6 +200,9 @@ def fc_return(self) -> str:
                 '    *{0}_size = result.size();\n'
                 '    return;\n').format(self.name)
 
+    def fc_return_default(self) -> str:
+        return ''  # memfun has void signature
+
 
 class VecDbl(Par):
     """Represents a vector of double parameter.
@@ -263,6 +269,9 @@ def fc_return(self) -> str:
                 '    *{0}_size = result.size();\n'
                 '    return;\n').format(self.name)
 
+    def fc_return_default(self) -> str:
+        return ''  # memfun has void signature
+
 
 class Vec2Dbl(Par):
     """Represents a vector of vector of double parameter.
@@ -350,6 +359,9 @@ def fc_return(self) -> str:
 
         return textwrap.indent(result.format(self.name), '    ')
 
+    def fc_return_default(self) -> str:
+        return ''  # memfun has void signature
+
 
 class VecSizet(Par):
     """Represents a vector of size_t parameter.
@@ -416,6 +428,9 @@ def fc_return(self) -> str:
                 '    *{0}_size = result.size();\n'
                 '    return;\n').format(self.name)
 
+    def fc_return_default(self) -> str:
+        return ''  # memfun has void signature
+
 
 class Array(Par):
     def __init__(
@@ -557,6 +572,9 @@ def fc_return(self) -> str:
                               self.ndims),
                 '    ')
 
+    def fc_return_default(self) -> str:
+        return ''  # memfun has void signature
+
     def _f_dims(self) -> str:
         return ', '.join([':'] * self.ndims)
 
@@ -624,6 +642,9 @@ def fc_return(self) -> str:
                 '    *{0}_size = result.size();\n'
                 '    return;\n').format(self.name)
 
+    def fc_return_default(self) -> str:
+        return ''  # memfun has void signature
+
 
 class Obj(Par):
     """Represents an object of a type to pass.
@@ -686,6 +707,9 @@ def f_call_c(self, result_name: str, call: str) -> str:
     def f_return_result(self, return_name: str, result_name: str) -> str:
         return '    {}%ptr = {}\n'.format(return_name, result_name)
 
+    def fc_return_default(self) -> str:
+        return '    return 0;\n'
+
 
 class Bool(Par):
     """Represents a bool-typed parameter.
@@ -732,6 +756,9 @@ def f_call_c(self, result_name: str, call: str) -> str:
     def f_return_result(self, return_name: str, result_name: str) -> str:
         return '    {} = {}\n'.format(return_name, result_name)
 
+    def fc_return_default(self) -> str:
+        return '    return false;\n'
+
 
 class EnumVal(Par):
     """Represents an enum-typed parameter.
@@ -784,6 +811,9 @@ def fc_get_result(self, cpp_chain_call: str) -> str:
     def fc_return(self) -> str:
         return '    return static_cast<int>(result);\n'
 
+    def fc_return_default(self) -> str:
+        return '    return 0;\n'
+
 
 class Int(Par):
     """Represents an int-typed parameter.
@@ -821,6 +851,9 @@ def fc_get_result(self, cpp_chain_call: str) -> str:
     def fc_return(self) -> str:
         return '    return result;\n'
 
+    def fc_return_default(self) -> str:
+        return '    return 0;\n'
+
 
 class Char(Par):
     """Represents an char-typed parameter.
@@ -859,6 +892,9 @@ def fc_get_result(self, cpp_chain_call: str) -> str:
     def fc_return(self) -> str:
         return '    return result;\n'
 
+    def fc_return_default(self) -> str:
+        return '    return 0;\n'
+
 
 class Int16t(Par):
     """Represents an int16_t-typed parameter.
@@ -896,6 +932,9 @@ def fc_get_result(self, cpp_chain_call: str) -> str:
     def fc_return(self) -> str:
         return '    return result;\n'
 
+    def fc_return_default(self) -> str:
+        return '    return 0;\n'
+
 
 class Int32t(Par):
     """Represents an int32_t-typed parameter.
@@ -934,6 +973,9 @@ def fc_get_result(self, cpp_chain_call: str) -> str:
     def fc_return(self) -> str:
         return '    return result;\n'
 
+    def fc_return_default(self) -> str:
+        return '    return 0;\n'
+
 
 class Int64t(Par):
     """Represents an int64_t-typed parameter.
@@ -971,6 +1013,9 @@ def fc_get_result(self, cpp_chain_call: str) -> str:
     def fc_return(self) -> str:
         return '    return result;\n'
 
+    def fc_return_default(self) -> str:
+        return '    return 0;\n'
+
 
 class Sizet(Par):
     """Represents an size_t-typed parameter.
@@ -1009,6 +1054,9 @@ def fc_get_result(self, cpp_chain_call: str) -> str:
     def fc_return(self) -> str:
         return '    return result;\n'
 
+    def fc_return_default(self) -> str:
+        return '    return 0;\n'
+
 
 class Float(Par):
     """Represents a single precision float parameter.
@@ -1047,6 +1095,9 @@ def fc_get_result(self, cpp_chain_call: str) -> str:
     def fc_return(self) -> str:
         return '    return result;\n'
 
+    def fc_return_default(self) -> str:
+        return '    return 0.0;\n'
+
 
 class Double(Par):
     """Represents a double precision float parameter.
@@ -1085,6 +1136,9 @@ def fc_get_result(self, cpp_chain_call: str) -> str:
     def fc_return(self) -> str:
         return '    return result;\n'
 
+    def fc_return_default(self) -> str:
+        return '    return 0.0;\n'
+
 
 class T(Par):
     """Represents a template dummy type.
@@ -1227,6 +1281,7 @@ def fortran_c_wrapper(self) -> str:
                     catch += '    *err_msg_len = msg.size();\n'
                     catch += '}\n'
                     result += textwrap.indent(catch, 4*' ')
+            result += self._fc_return_default()
         else:
             result += self._fc_cpp_call()
             result += self._fc_return()
@@ -1436,6 +1491,9 @@ def _fc_cpp_call(self) -> str:
     def _fc_return(self) -> str:
         return self.ret_type.fc_return()
 
+    def _fc_return_default(self) -> str:
+        return self.ret_type.fc_return_default()
+
     def _fc_in_parameters(self) -> List[str]:
         """Create a list of input parameters.
         """
diff --git a/scripts/make_libmuscle_api.py b/scripts/make_libmuscle_api.py
index 33a17a52..0b083a2e 100755
--- a/scripts/make_libmuscle_api.py
+++ b/scripts/make_libmuscle_api.py
@@ -451,6 +451,7 @@ def __copy__(self) -> 'Elements':
                 '        *err_msg = const_cast<char*>(msg.data());\n'
                 '        *err_msg_len = msg.size();\n'
                 '    }\n'
+                '    return 0;\n'
                 '}\n\n')
             ),
     MemFun(Obj('DataConstRef', 'value'), 'get_item_by_index', [Sizet('i')], True,
@@ -480,6 +481,7 @@ def __copy__(self) -> 'Elements':
                 '        *err_msg = const_cast<char*>(msg.data());\n'
                 '        *err_msg_len = msg.size();\n'
                 '    }\n'
+                '    return 0;\n'
                 '}\n\n')
             ),
     OverloadSet('get_item', [
@@ -503,6 +505,7 @@ def __copy__(self) -> 'Elements':
                 '        *err_msg = const_cast<char*>(msg.data());\n'
                 '        *err_msg_len = msg.size();\n'
                 '    }\n'
+                '    return 0;\n'
                 '}\n\n')
             ),
     MemFun(VecSizet('shp'), 'shape', [], True),
@@ -623,6 +626,7 @@ def __copy__(self) -> 'Elements':
                 '        *err_msg = const_cast<char*>(msg.data());\n'
                 '        *err_msg_len = msg.size();\n'
                 '    }\n'
+                '    return 0;\n'
                 '}\n\n')
             ),
     MemFun(Obj('Data', 'value'), 'get_item_by_index', [Sizet('i')], True,
@@ -652,6 +656,7 @@ def __copy__(self) -> 'Elements':
                 '        *err_msg = const_cast<char*>(msg.data());\n'
                 '        *err_msg_len = msg.size();\n'
                 '    }\n'
+                '    return 0;\n'
                 '}\n\n')
             ),
     OverloadSet('get_item', [
@@ -743,6 +748,7 @@ def __copy__(self) -> 'Elements':
                 '        *err_msg = const_cast<char*>(msg.data());\n'
                 '        *err_msg_len = msg.size();\n'
                 '    }\n'
+                '    return 0;\n'
                 '}\n\n')
             ),
     ])

From 272617d656600ecbe1d78473a14375730fd2c0be Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Thu, 3 Nov 2022 11:34:08 +0100
Subject: [PATCH 064/183] Fix some compiler warnings

- Add virtual destructor for TcpTransportServer (fixes a
  -Wdelete-abstract-non-virtual-dtor)
- Add std::move to avoid copying data (fixes a few -Wreturn-std-move)
---
 libmuscle/cpp/src/libmuscle/data.cpp                     | 2 +-
 libmuscle/cpp/src/libmuscle/mcp/tcp_transport_client.cpp | 2 +-
 libmuscle/cpp/src/libmuscle/mcp/transport_server.hpp     | 4 ++++
 libmuscle/cpp/src/libmuscle/mpp_message.cpp              | 2 +-
 4 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/libmuscle/cpp/src/libmuscle/data.cpp b/libmuscle/cpp/src/libmuscle/data.cpp
index 53b57694..666a8423 100644
--- a/libmuscle/cpp/src/libmuscle/data.cpp
+++ b/libmuscle/cpp/src/libmuscle/data.cpp
@@ -962,7 +962,7 @@ DataConstRef DataConstRef::grid_data_<bool>(
         Data result = Data::byte_array(num_elems);
         char * data_copy = result.as_byte_array();
         std::copy(data, data + num_elems, data_copy);
-        return result;
+        return std::move(result);
     }
 }
 
diff --git a/libmuscle/cpp/src/libmuscle/mcp/tcp_transport_client.cpp b/libmuscle/cpp/src/libmuscle/mcp/tcp_transport_client.cpp
index 959737d2..c0e95b90 100644
--- a/libmuscle/cpp/src/libmuscle/mcp/tcp_transport_client.cpp
+++ b/libmuscle/cpp/src/libmuscle/mcp/tcp_transport_client.cpp
@@ -129,7 +129,7 @@ DataConstRef TcpTransportClient::call(
     int64_t length = recv_int64(socket_fd_);
     auto result = Data::byte_array(length);
     recv_all(socket_fd_, result.as_byte_array(), result.size());
-    return result;
+    return std::move(result);
 }
 
 void TcpTransportClient::close() {
diff --git a/libmuscle/cpp/src/libmuscle/mcp/transport_server.hpp b/libmuscle/cpp/src/libmuscle/mcp/transport_server.hpp
index c37a61d0..32ada05d 100644
--- a/libmuscle/cpp/src/libmuscle/mcp/transport_server.hpp
+++ b/libmuscle/cpp/src/libmuscle/mcp/transport_server.hpp
@@ -66,6 +66,10 @@ class TransportServer {
          */
         TransportServer(RequestHandler & handler);
 
+        /** Destroy the Transport Server object
+         */
+        virtual ~TransportServer() = default;
+
         /** Returns the location this server listens on.
          *
          * @return A string containing the location.
diff --git a/libmuscle/cpp/src/libmuscle/mpp_message.cpp b/libmuscle/cpp/src/libmuscle/mpp_message.cpp
index 2962e31c..5f796224 100644
--- a/libmuscle/cpp/src/libmuscle/mpp_message.cpp
+++ b/libmuscle/cpp/src/libmuscle/mpp_message.cpp
@@ -76,7 +76,7 @@ DataConstRef MPPMessage::encoded() const {
     auto bytes = Data::byte_array(sbuf.size());
     memcpy(bytes.as_byte_array(), sbuf.data(), sbuf.size());
 
-    return bytes;
+    return std::move(bytes);
 }
 
 } }

From b4d74c79e0832ae43caa4a012d94b5224bb1e891 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Mon, 7 Nov 2022 16:01:01 +0100
Subject: [PATCH 065/183] Fixes #126. See issue for more details.

---
 .../examples/rd_implementations.ymmsl.in      | 20 +++++++++----------
 integration_test/test_start_all.py            |  2 +-
 integration_test/test_start_mpi.py            |  4 ++--
 3 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/docs/source/examples/rd_implementations.ymmsl.in b/docs/source/examples/rd_implementations.ymmsl.in
index c6cef229..4f2b0c7b 100644
--- a/docs/source/examples/rd_implementations.ymmsl.in
+++ b/docs/source/examples/rd_implementations.ymmsl.in
@@ -8,23 +8,23 @@ implementations:
 
   reaction_cpp:
     env:
-      +LD_LIBRARY_PATH: MUSCLE3_HOME/lib
+      +LD_LIBRARY_PATH: :MUSCLE3_HOME/lib
     executable: MUSCLE3_EXAMPLES/cpp/build/reaction
 
   reaction_cpp_mpi:
     env:
-      +LD_LIBRARY_PATH: MUSCLE3_HOME/lib
+      +LD_LIBRARY_PATH: :MUSCLE3_HOME/lib
     executable: MUSCLE3_EXAMPLES/cpp/build/reaction_mpi
     execution_model: openmpi
 
   reaction_fortran:
     env:
-      +LD_LIBRARY_PATH: MUSCLE3_HOME/lib
+      +LD_LIBRARY_PATH: :MUSCLE3_HOME/lib
     executable: MUSCLE3_EXAMPLES/fortran/build/reaction
 
   reaction_fortran_mpi:
     env:
-      +LD_LIBRARY_PATH: MUSCLE3_HOME/lib
+      +LD_LIBRARY_PATH: :MUSCLE3_HOME/lib
     executable: MUSCLE3_EXAMPLES/fortran/build/reaction_mpi
     execution_model: openmpi
 
@@ -35,30 +35,30 @@ implementations:
 
   diffusion_cpp:
     env:
-      +LD_LIBRARY_PATH: MUSCLE3_HOME/lib
+      +LD_LIBRARY_PATH: :MUSCLE3_HOME/lib
     executable: MUSCLE3_EXAMPLES/cpp/build/diffusion
 
   diffusion_fortran:
     env:
-      +LD_LIBRARY_PATH: MUSCLE3_HOME/lib
+      +LD_LIBRARY_PATH: :MUSCLE3_HOME/lib
     executable: MUSCLE3_EXAMPLES/fortran/build/diffusion
 
   mc_driver_cpp:
     env:
-      +LD_LIBRARY_PATH: MUSCLE3_HOME/lib
+      +LD_LIBRARY_PATH: :MUSCLE3_HOME/lib
     executable: MUSCLE3_EXAMPLES/cpp/build/mc_driver
 
   mc_driver_fortran:
     env:
-      +LD_LIBRARY_PATH: MUSCLE3_HOME/lib
+      +LD_LIBRARY_PATH: :MUSCLE3_HOME/lib
     executable: MUSCLE3_EXAMPLES/fortran/build/mc_driver
 
   load_balancer_cpp:
     env:
-      +LD_LIBRARY_PATH: MUSCLE3_HOME/lib
+      +LD_LIBRARY_PATH: :MUSCLE3_HOME/lib
     executable: MUSCLE3_EXAMPLES/cpp/build/load_balancer
 
   load_balancer_fortran:
     env:
-      +LD_LIBRARY_PATH: MUSCLE3_HOME/lib
+      +LD_LIBRARY_PATH: :MUSCLE3_HOME/lib
     executable: MUSCLE3_EXAMPLES/fortran/build/load_balancer
diff --git a/integration_test/test_start_all.py b/integration_test/test_start_all.py
index bb8d03b2..e2cb813b 100644
--- a/integration_test/test_start_all.py
+++ b/integration_test/test_start_all.py
@@ -42,7 +42,7 @@ def test_start_all(tmpdir):
             'implementations:\n'
             '  component:\n'
             '    env:\n'
-            '      +LD_LIBRARY_PATH: {}\n'
+            '      +LD_LIBRARY_PATH: :{}\n'
             '    executable: {}\n'
             'resources:\n'
             '  macro:\n'
diff --git a/integration_test/test_start_mpi.py b/integration_test/test_start_mpi.py
index 233cd210..dca5c9d2 100644
--- a/integration_test/test_start_mpi.py
+++ b/integration_test/test_start_mpi.py
@@ -49,11 +49,11 @@ def test_start_mpi(tmpdir):
             'implementations:\n'
             '  component:\n'
             '    env:\n'
-            '      +LD_LIBRARY_PATH: {}\n'
+            '      +LD_LIBRARY_PATH: :{}\n'
             '    executable: {}\n'
             '  mpi_component:\n'
             '    env:\n'
-            '      +LD_LIBRARY_PATH: {}\n'
+            '      +LD_LIBRARY_PATH: :{}\n'
             '    executable: {}\n'
             '    execution_model: openmpi\n'
             'resources:\n'

From 6587894ad11df1679aa9f974e0758aa88255b9f6 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Wed, 9 Nov 2022 11:41:48 +0100
Subject: [PATCH 066/183] Fix missing function call

Nice catch of mypy!
---
 libmuscle/python/libmuscle/instance.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py
index 513018d6..ae952111 100644
--- a/libmuscle/python/libmuscle/instance.py
+++ b/libmuscle/python/libmuscle/instance.py
@@ -502,13 +502,13 @@ def __receive_message(
         else:
             msg = self._communicator.receive_message(
                     port_name, slot, default)
-            if port.is_connected and not port.is_open(slot):
+            if port.is_connected() and not port.is_open(slot):
                 err_msg = (('Port {} was closed while trying to'
                             ' receive on it, did the peer crash?'
                             ).format(port_name))
                 self.__shutdown(err_msg)
                 raise RuntimeError(err_msg)
-            if port.is_connected and not with_settings:
+            if port.is_connected() and not with_settings:
                 self.__check_compatibility(port_name, msg.settings)
             if not with_settings:
                 msg.settings = None

From 50251724db1c060bc226f9f96f1b0486a93d601d Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Thu, 10 Nov 2022 14:25:44 +0100
Subject: [PATCH 067/183] Update call signatures to new design

Skip failing unit tests.
---
 integration_test/test_snapshot_macro_micro.py | 58 ++++++++--------
 .../python/libmuscle/checkpoint_triggers.py   | 23 ++-----
 libmuscle/python/libmuscle/instance.py        | 68 ++++++++-----------
 .../python/libmuscle/snapshot_manager.py      | 23 +++++--
 .../test/test_checkpoint_triggers.py          |  2 +
 .../libmuscle/test/test_snapshot_manager.py   |  2 +
 6 files changed, 86 insertions(+), 90 deletions(-)

diff --git a/integration_test/test_snapshot_macro_micro.py b/integration_test/test_snapshot_macro_micro.py
index 93427098..ae657b5b 100644
--- a/integration_test/test_snapshot_macro_micro.py
+++ b/integration_test/test_snapshot_macro_micro.py
@@ -17,26 +17,24 @@ def macro():
             Operator.S: ['s']})
 
     while instance.reuse_instance():
-        t_cur = instance.get_setting('t0', 'float')
         dt = instance.get_setting('dt', 'float')
         t_max = instance.get_setting('t_max', 'float')
 
         if instance.resuming():
             msg = instance.load_snapshot()
+            # load state from message
             t_cur = msg.timestamp
-            assert msg.next_timestamp == pytest.approx(t_cur + dt)
             i = msg.data
-            assert i >= 0
-        else:
+            assert i >= 1
+
+        if instance.should_init():
+            t_cur = instance.get_setting('t0', 'float')
             i = 0
 
         while t_cur + dt <= t_max:
             t_next = t_cur + dt
-
-            if instance.should_save_snapshot(t_cur, t_next):
-                instance.save_snapshot(Message(t_cur, t_next, i))
-
-            t_next = None if t_next + dt > t_max else t_next
+            if t_next + dt > t_max:
+                t_next = None  # final iteration of this time-integration loop
             instance.send('o_i', Message(t_cur, t_next, i))
 
             msg = instance.receive('s')
@@ -45,6 +43,9 @@ def macro():
             i += 1
             t_cur += dt
 
+            if instance.should_save_snapshot(t_cur):
+                instance.save_snapshot(Message(t_cur, None, i))
+
         if instance.should_save_final_snapshot(t_cur):
             instance.save_final_snapshot(Message(t_cur, None, i))
 
@@ -55,26 +56,24 @@ def macro_vector():
             Operator.S: ['s[]']})
 
     while instance.reuse_instance():
-        t_cur = instance.get_setting('t0', 'float')
         dt = instance.get_setting('dt', 'float')
         t_max = instance.get_setting('t_max', 'float')
 
         if instance.resuming():
             msg = instance.load_snapshot()
+            # load state from message
             t_cur = msg.timestamp
-            assert msg.next_timestamp == pytest.approx(t_cur + dt)
             i = msg.data
-            assert i >= 0
-        else:
+            assert i >= 1
+
+        if instance.should_init():
+            t_cur = instance.get_setting('t0', 'float')
             i = 0
 
         while t_cur + dt <= t_max:
             t_next = t_cur + dt
-
-            if instance.should_save_snapshot(t_cur, t_next):
-                instance.save_snapshot(Message(t_cur, t_next, i))
-
-            t_next = None if t_next + dt > t_max else t_next
+            if t_next + dt > t_max:
+                t_next = None  # final iteration of this time-integration loop
             for slot in range(instance.get_port_length('o_i')):
                 instance.send('o_i', Message(t_cur, t_next, i), slot)
 
@@ -85,7 +84,10 @@ def macro_vector():
             i += 1
             t_cur += dt
 
-        if instance.should_save_final_snapshot(t_cur):
+            if instance.should_save_snapshot(t_cur):
+                instance.save_snapshot(Message(t_cur, None, i))
+
+        if instance.should_save_final_snapshot():
             instance.save_final_snapshot(Message(t_cur, None, i))
 
 
@@ -102,25 +104,25 @@ def micro():
             msg = instance.load_snapshot()
             t_cur = msg.timestamp
             i, t_stop = msg.data
-        else:
+
+        if instance.should_init():
             msg = instance.receive('f_i')
             t_cur = msg.timestamp
             i = msg.data
             t_stop = t_cur + t_max
 
         while t_cur < t_stop:
-            t_next = t_cur + dt
-
-            if instance.should_save_snapshot(t_cur, t_next):
-                instance.save_snapshot(Message(t_cur, t_next, [i, t_stop]))
-
+            # faux time-integration for testing snapshots
             t_cur += dt
 
-        if instance.should_save_final_snapshot(t_cur):
-            instance.save_final_snapshot(Message(t_cur, None, [i, t_stop]))
+            if instance.should_save_snapshot(t_cur):
+                instance.save_snapshot(Message(t_cur, None, [i, t_stop]))
 
         instance.send('o_f', Message(t_cur, None, i))
 
+        if instance.should_save_final_snapshot():
+            instance.save_final_snapshot(Message(t_cur, None, [i, t_stop]))
+
 
 @pytest.fixture
 def base_config():
@@ -163,6 +165,7 @@ def base_config():
   - every: 0.4""")
 
 
+@pytest.mark.skip("To be updated")
 def test_snapshot_macro_micro(tmp_path, base_config):
     base_config.check_consistent()
     run_dir1 = RunDir(tmp_path / 'run1')
@@ -203,6 +206,7 @@ def test_snapshot_macro_micro(tmp_path, base_config):
     assert len(snapshots_ymmsl) == 2
 
 
+@pytest.mark.skip("To be updated")
 def test_snapshot_macro_vector_micro(tmp_path, base_config):
     macro_implementation = base_config.implementations['macro_implementation']
     macro_implementation.args[-1] = 'macro_vector'
diff --git a/libmuscle/python/libmuscle/checkpoint_triggers.py b/libmuscle/python/libmuscle/checkpoint_triggers.py
index 88d47553..b134f76d 100644
--- a/libmuscle/python/libmuscle/checkpoint_triggers.py
+++ b/libmuscle/python/libmuscle/checkpoint_triggers.py
@@ -215,8 +215,7 @@ def elapsed_walltime(self) -> float:
         """
         return time.monotonic() - self._monotonic_reference
 
-    def should_save_snapshot(self, timestamp: float,
-                             next_timestamp: Optional[float]) -> bool:
+    def should_save_snapshot(self, timestamp: float) -> bool:
         """Handles instance.should_save_snapshot
         """
         if self._should_have_saved:
@@ -224,19 +223,12 @@ def should_save_snapshot(self, timestamp: float,
                               '"should_save_final_snapshot" returned positive'
                               ' but no snapshot was saved before the next call')
 
-        value = False
         elapsed_walltime = self.elapsed_walltime()
-        if next_timestamp is None:
-            _logger.warning('No "next_timestamp" provided. Workflow may not'
-                            ' be able to create a consistent snapshot. See '
-                            'https://muscle3.readthedocs.io/en/latest/checkpoints.html')
-            value = self.__should_save(elapsed_walltime, timestamp)
-        else:
-            value = self.__should_save(elapsed_walltime, next_timestamp)
+        value = self.__should_save(elapsed_walltime, timestamp)
         self._should_have_saved = value
         return value
 
-    def should_save_final_snapshot(self, timestamp: float) -> bool:
+    def should_save_final_snapshot(self) -> bool:
         """Handles instance.should_save_final_snapshot
         """
         if self._should_have_saved:
@@ -285,9 +277,7 @@ def reuse_instance(self, max_f_init_next_timestamp: Optional[float]
             self._should_save_final_called = False
             self._saved_final_checkpoint = False
 
-    def update_checkpoints(self, timestamp: float,
-                           next_timestamp: Optional[float], final: bool
-                           ) -> None:
+    def update_checkpoints(self, timestamp: float, final: bool) -> None:
         """Update last and next checkpoint times when a snapshot is made
 
         Args:
@@ -300,10 +290,7 @@ def update_checkpoints(self, timestamp: float,
         if final and self._max_f_init_next_timestamp is not None:
             simulation_time = self._max_f_init_next_timestamp
         else:
-            if next_timestamp is None:
-                simulation_time = timestamp
-            else:
-                simulation_time = next_timestamp
+            simulation_time = timestamp
         self._prevsim = simulation_time
         self._nextsim = self._sim.next_checkpoint(simulation_time)
 
diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py
index e31ab594..755503bc 100644
--- a/libmuscle/python/libmuscle/instance.py
+++ b/libmuscle/python/libmuscle/instance.py
@@ -430,7 +430,7 @@ def resuming(self) -> bool:
         This method returns True for the first iteration of the reuse loop after
         resuming from a previously taken snapshot. When resuming from a
         snapshot, the submodel must load its state from the snapshot as returned
-        by :meth:`load_snapshot` and the F_INIT step must be skipped.
+        by :meth:`load_snapshot`.
 
         Returns:
             True iff the submodel must resume from a snapshot instead of the
@@ -438,6 +438,20 @@ def resuming(self) -> bool:
         """
         return self._snapshot_manager.resuming()
 
+    def should_init(self) -> bool:
+        """Check if this instance should initialize.
+
+        Must be used by submodels that implement the checkpointing API.
+
+        When resuming from a previous snapshot, instances need not always
+        execute the F_INIT phase of the submodel execution loop. Use this method
+        before attempting to receive data on F_INIT ports.
+
+        Returns:
+            True iff the submodel must skip the F_INIT step
+        """
+        return self._snapshot_manager.should_init()
+
     def load_snapshot(self) -> Message:
         """Load a snapshot.
 
@@ -452,42 +466,27 @@ def load_snapshot(self) -> Message:
         """
         return self._snapshot_manager.load_snapshot()
 
-    def should_save_snapshot(
-            self, timestamp: float, next_timestamp: Optional[float]) -> bool:
+    def should_save_snapshot(self, timestamp: float) -> bool:
         """Check if a snapshot should be saved inside a time-integration loop.
 
         This method checks if a snapshot should be saved right now, based on the
-        provided timestamps and passed wallclock time.
-
-        When the next timestamp is provided, this value will be used to
-        determine if a checkpoint will be passed between now and the next time
-        step. A submodel should always provide the next timestamp if available,
-        since this is the most reliable way to get consistent snapshots across
-        all submodels in the run.
-
-        When a submodel cannot provide the next timestamp, a best efford is made
-        to get consistent snapshots (based on the current timestamp). See the
-        checkpointing tutorial for more information.
+        provided timestamp and passed wallclock time.
 
         When this method returns True, the submodel must also save a snapshot
         through :meth:`save_snapshot`. A RuntimeError will be generated when not
         doing so.
 
         See also :meth:`should_save_final_snapshot` for the variant that must be
-        called at the end of a time-integration loop, or when a submodel does
-        not have a time-integration loop.
+        called at the end of the reuse loop.
 
         Args:
             timestamp: current timestamp of the submodel
-            next_timestamp: timestamp of the next iteration of the time
-                integration loop of the submodel or ``None`` if not available
 
         Returns:
             True iff a snapshot should be taken by the submodel according to the
             checkpoint rules provided in the ymmsl configuration.
         """
-        return self._snapshot_manager.should_save_snapshot(
-                timestamp, next_timestamp)
+        return self._snapshot_manager.should_save_snapshot(timestamp)
 
     def save_snapshot(self, message: Message) -> None:
         """Save a snapshot inside a time-integration loop.
@@ -495,8 +494,8 @@ def save_snapshot(self, message: Message) -> None:
         Before saving a snapshot, you should check using
         :meth:`should_save_snapshot` if a snapshot should be saved according to
         the checkpoint rules specified in the ymmsl configuration. You should
-        use the same timestamp and next_timestamp in the provided Message object
-        as used to query `should_save_snapshot`.
+        use the same timestamp in the provided Message object as used to query
+        `should_save_snapshot`.
 
         Although it is allowed to save a snapshot even when
         :meth:`should_save_snapshot` returns False, you should avoid this: this
@@ -505,18 +504,17 @@ def save_snapshot(self, message: Message) -> None:
         It could also lead to a lot of snapshot files clogging your file system.
 
         See also :meth:`save_final_snapshot` for the variant that must be called
-        at the end of a time-integration loop, or when a submodel does not have
-        a time-integration loop.
+        at the end of the reuse loop.
 
         Args:
             message: Message object that is saved as snapshot. The message
-                timestamp and next_timestamp attributes should be the same as
-                passed to :meth:`should_save_snapshot`. The data attribute can
-                be used to store the internal state of the submodel.
+                timestamp attribute should be the same as passed to
+                :meth:`should_save_snapshot`. The data attribute can be used to
+                store the internal state of the submodel.
         """
         return self._snapshot_manager.save_snapshot(message)
 
-    def should_save_final_snapshot(self, timestamp: float) -> bool:
+    def should_save_final_snapshot(self) -> bool:
         """Check if a snapshot should be saved before O_F.
 
         This method checks if a snapshot should be saved right now, based on the
@@ -529,14 +527,11 @@ def should_save_final_snapshot(self, timestamp: float) -> bool:
         See also :meth:`should_save_snapshot` for the variant that may be called
         inside of a time-integration loop of the submodel.
 
-        Args:
-            timestamp: current timestamp of the submodel
-
         Returns:
             True iff a final snapshot should be taken by the submodel according
             to the checkpoint rules provided in the ymmsl configuration.
         """
-        return self._snapshot_manager.should_save_final_snapshot(timestamp)
+        return self._snapshot_manager.should_save_final_snapshot()
 
     def save_final_snapshot(self, message: Message) -> None:
         """Save a snapshot before O_F.
@@ -544,8 +539,6 @@ def save_final_snapshot(self, message: Message) -> None:
         Before saving a snapshot, you should check using
         :meth:`should_save_final_snapshot` if a snapshot should be saved
         according to the checkpoint rules specified in the ymmsl configuration.
-        You should use the same timestamp in the provided Message object as used
-        to query `should_save_final_snapshot`.
 
         Although it is allowed to save a snapshot even when
         :meth:`should_save_final_snapshot` returns False, you should avoid this:
@@ -557,10 +550,9 @@ def save_final_snapshot(self, message: Message) -> None:
         of a time-integration loop of the submodel.
 
         Args:
-            message: Message object that is saved as snapshot. The message
-                timestamp should be the same as passed to
-                :meth:`should_save_snapshot`. The data attribute can be used to
-                store the internal state of the submodel.
+            message: Message object that is saved as snapshot. The data
+                attribute can be used to store the internal state of the
+                submodel.
         """
         return self._snapshot_manager.save_final_snapshot(message)
 
diff --git a/libmuscle/python/libmuscle/snapshot_manager.py b/libmuscle/python/libmuscle/snapshot_manager.py
index 10f2c9fc..aa9f5dc6 100644
--- a/libmuscle/python/libmuscle/snapshot_manager.py
+++ b/libmuscle/python/libmuscle/snapshot_manager.py
@@ -70,7 +70,6 @@ def set_checkpoint_info(self,
             if self._trigger:
                 self._trigger.update_checkpoints(
                     snapshot.message.timestamp,
-                    snapshot.message.next_timestamp,
                     snapshot.is_final_snapshot)
 
     def reuse_instance(self,
@@ -99,6 +98,17 @@ def resuming(self) -> bool:
         """
         return self._resume_from_snapshot is not None
 
+    def should_init(self) -> bool:
+        """Check if F_INIT should be run in this reuse loop.
+
+        Returns:
+            True: when not resuming this reuse loop, or when resuming from a
+                final snapshot.
+            False: otherwise
+        """
+        return (self._resume_from_snapshot is None or
+                self._resume_from_snapshot.is_final_snapshot)
+
     def load_snapshot(self) -> Message:
         """Get the Message to resume from
         """
@@ -107,20 +117,19 @@ def load_snapshot(self) -> Message:
                                ' to check if a snapshot is available')
         return self._resume_from_snapshot.message
 
-    def should_save_snapshot(self, timestamp: float,
-                             next_timestamp: Optional[float]) -> bool:
+    def should_save_snapshot(self, timestamp: float) -> bool:
         """See :meth:`TriggerManager.should_save_snapshot`
         """
         if self._trigger is None:
             return False  # checkpointing disabled
-        return self._trigger.should_save_snapshot(timestamp, next_timestamp)
+        return self._trigger.should_save_snapshot(timestamp)
 
-    def should_save_final_snapshot(self, timestamp: float) -> bool:
+    def should_save_final_snapshot(self) -> bool:
         """See :meth:`TriggerManager.should_save_final_snapshot`
         """
         if self._trigger is None:
             return False  # checkpointing disabled
-        return self._trigger.should_save_final_snapshot(timestamp)
+        return self._trigger.should_save_final_snapshot()
 
     def save_snapshot(self, msg: Message) -> None:
         """Save snapshot contained in the message object.
@@ -158,7 +167,7 @@ def __save_snapshot(self, msg: Message, final: bool) -> None:
 
         if self._trigger is not None:
             self._trigger.update_checkpoints(
-                msg.timestamp, msg.next_timestamp, final)
+                msg.timestamp, final)
 
     def __load_snapshot(self, snapshot_location: Path) -> None:
         """Load a previously stored snapshot from the filesystem
diff --git a/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py b/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py
index baf0c2c1..873f79d8 100644
--- a/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py
+++ b/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py
@@ -147,6 +147,7 @@ def test_trigger_manager_reference_time():
     assert 15.0 < elapsed_walltime <= (15.0 + elapsed_monotonic)
 
 
+@pytest.mark.skip("To be updated")
 def test_trigger_manager():
     reference = datetime.now(timezone.utc)
     trigger_manager = TriggerManager(reference, Checkpoints(
@@ -196,6 +197,7 @@ def test_trigger_manager():
     trigger_manager.reuse_instance(None)
 
 
+@pytest.mark.skip("To be updated")
 def test_trigger_manager_warnings(caplog: pytest.LogCaptureFixture,
                                   monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv("MUSCLE_DISABLE_CHECKPOINT_ERRORS", "1")
diff --git a/libmuscle/python/libmuscle/test/test_snapshot_manager.py b/libmuscle/python/libmuscle/test/test_snapshot_manager.py
index f1c18ec8..bfee09a1 100644
--- a/libmuscle/python/libmuscle/test/test_snapshot_manager.py
+++ b/libmuscle/python/libmuscle/test/test_snapshot_manager.py
@@ -11,6 +11,7 @@
 from libmuscle.snapshot_manager import SnapshotManager
 
 
+@pytest.mark.skip("To be updated")
 def test_no_checkpointing(caplog: pytest.LogCaptureFixture, tmp_path: Path
                           ) -> None:
     manager = MagicMock()
@@ -33,6 +34,7 @@ def test_no_checkpointing(caplog: pytest.LogCaptureFixture, tmp_path: Path
         assert "no checkpoints" in caplog.records[0].message
 
 
+@pytest.mark.skip("To be updated")
 def test_save_load_checkpoint(tmp_path: Path) -> None:
     manager = MagicMock()
     communicator = MagicMock()

From 60b8aa961006f79acb0fe3c7f16710b6a0cfecaa Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Mon, 14 Nov 2022 13:30:52 +0100
Subject: [PATCH 068/183] Refactor reuse_instance logic

- Execute as part of should_save_final_snapshot
- Check in Instance.reuse_instance if already called & call otherwise
---
 .../python/libmuscle/checkpoint_triggers.py   |   9 +-
 libmuscle/python/libmuscle/instance.py        | 117 +++++++++++-------
 .../python/libmuscle/snapshot_manager.py      |  16 ++-
 3 files changed, 83 insertions(+), 59 deletions(-)

diff --git a/libmuscle/python/libmuscle/checkpoint_triggers.py b/libmuscle/python/libmuscle/checkpoint_triggers.py
index b134f76d..51bd7848 100644
--- a/libmuscle/python/libmuscle/checkpoint_triggers.py
+++ b/libmuscle/python/libmuscle/checkpoint_triggers.py
@@ -252,16 +252,9 @@ def should_save_final_snapshot(self) -> bool:
         self._should_save_final_called = True
         return value
 
-    def reuse_instance(self, max_f_init_next_timestamp: Optional[float]
-                       ) -> None:
+    def reuse_instance(self) -> None:
         """Cleanup between instance reuse
-
-        Args:
-            max_f_init_next_timestamp: the maximum next_timestamp of all
-                messages pre--received during F_INIT.
         """
-        self._max_f_init_next_timestamp = max_f_init_next_timestamp
-
         if self._first_reuse:
             self._first_reuse = False
         else:
diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py
index 755503bc..7e2559d9 100644
--- a/libmuscle/python/libmuscle/instance.py
+++ b/libmuscle/python/libmuscle/instance.py
@@ -74,6 +74,8 @@ def __init__(self, ports: Optional[Dict[Operator, List[str]]] = None
 
         self._first_run = True
         """Keeps track of whether this is the first reuse run."""
+        self._do_reuse = None           # type: Optional[bool]
+        """Caching variable for result from :meth:`__check_reuse_instance`"""
 
         self._f_init_cache = dict()     # type: _FInitCacheType
 
@@ -129,37 +131,12 @@ def reuse_instance(self, apply_overlay: bool = True) -> bool:
                 :meth:`should_save_final_snapshot` and
                 :meth:`save_final_snapshot`, or the checkpointing tutorial.
         """
-        do_reuse = self.__receive_settings()
-
-        # TODO: _f_init_cache should be empty here, or the user didn't
-        # receive something that was sent on the last go-around.
-        # At least emit a warning.
-        if not (self.resuming() and self._first_run):
-            # when resuming we skip receiving on f_init in the first run
-            self.__pre_receive_f_init(apply_overlay)
-
-        self._set_local_log_level()
-        self._set_remote_log_level()
-
-        ports = self._communicator.list_ports()
-        f_init_not_connected = all(
-                [not self.is_connected(port)
-                 for port in ports.get(Operator.F_INIT, [])])
-        no_settings_in = not self._communicator.settings_in_connected()
-
-        if f_init_not_connected and no_settings_in:
-            do_reuse = self._first_run
-        else:
-            for message in self._f_init_cache.values():
-                if isinstance(message.data, ClosePort):
-                    do_reuse = False
-        self._first_run = False
+        do_reuse = self._do_reuse
+        if do_reuse is None:
+            # should_save_final_snapshot not called, so we need to check_reuse
+            do_reuse = self.__check_reuse_instance(apply_overlay)
+        self._do_reuse = None
 
-        max_f_init_next_timestamp = max(
-                (msg.next_timestamp
-                 for msg in self._f_init_cache.values()
-                 if msg.next_timestamp is not None),
-                default=None)
         # Note: muscle_snapshot_directory setting is provided by muscle_manager
         # when checkpointing is enabled for this run. When checkpointing is not
         # enabled, it might not exist and a KeyError is raised.
@@ -168,14 +145,8 @@ def reuse_instance(self, apply_overlay: bool = True) -> bool:
             snapshot_path = Path(snapshot_dir)
         except KeyError:
             snapshot_path = None
-        self._snapshot_manager.reuse_instance(
-                max_f_init_next_timestamp, snapshot_path)
+        self._snapshot_manager.reuse_instance(snapshot_path)
 
-        if not do_reuse:
-            self.__close_ports()
-            self._communicator.shutdown()
-            self._deregister()
-            self.__manager.close()
         return do_reuse
 
     def error_shutdown(self, message: str) -> None:
@@ -514,7 +485,7 @@ def save_snapshot(self, message: Message) -> None:
         """
         return self._snapshot_manager.save_snapshot(message)
 
-    def should_save_final_snapshot(self) -> bool:
+    def should_save_final_snapshot(self, *, apply_overlay: bool = True) -> bool:
         """Check if a snapshot should be saved before O_F.
 
         This method checks if a snapshot should be saved right now, based on the
@@ -527,11 +498,32 @@ def should_save_final_snapshot(self) -> bool:
         See also :meth:`should_save_snapshot` for the variant that may be called
         inside of a time-integration loop of the submodel.
 
+        .. note::
+            This method will block until it can determine whether a final
+            snapshot should be taken. This means it must also determine if this
+            instance is reused. The optional keword-only argument
+            `apply_overlay` has the same meaning as for :meth:`reuse_instance`.
+
+        Args:
+            apply_overlay: Whether to apply the received settings
+                overlay or to save it. If you're going to use
+                :meth:`receive_with_settings` on your F_INIT ports, set this to
+                False. If you don't know what that means, just call
+                `reuse_instance()` without specifying this and everything will
+                be fine. If it turns out that you did need to specify False,
+                MUSCLE3 will tell you about it in an error message and you can
+                add it still.
+
         Returns:
             True iff a final snapshot should be taken by the submodel according
             to the checkpoint rules provided in the ymmsl configuration.
         """
-        return self._snapshot_manager.should_save_final_snapshot()
+        self._do_reuse = self.__check_reuse_instance(apply_overlay)
+        f_init_max_timestamp = max(
+                (msg.timestamp for msg in self._f_init_cache.values()),
+                default=None)
+        return self._snapshot_manager.should_save_final_snapshot(
+                self._do_reuse, f_init_max_timestamp)
 
     def save_final_snapshot(self, message: Message) -> None:
         """Save a snapshot before O_F.
@@ -632,6 +624,46 @@ def __set_up_logging(self) -> None:
                                                      self.__manager)
             logging.getLogger().addHandler(self._mmp_handler)
 
+    def __check_reuse_instance(self, apply_overlay: bool) -> bool:
+        """Pre-receive F_INIT messages and detect if this instance is reused.
+
+        This is called during :meth:`should_save_final_snapshot` to detect if a
+        snapshot must be taken. If an instance does implement checkpointing,
+        :meth:`reuse_instance` will call it instead.
+        """
+        do_reuse = self.__receive_settings()
+
+        # TODO: _f_init_cache should be empty here, or the user didn't
+        # receive something that was sent on the last go-around.
+        # At least emit a warning.
+        if not (self.resuming() and self._first_run):
+            # when resuming we skip receiving on f_init in the first run
+            self.__pre_receive_f_init(apply_overlay)
+
+        self._set_local_log_level()
+        self._set_remote_log_level()
+
+        ports = self._communicator.list_ports()
+        f_init_not_connected = all(
+                [not self.is_connected(port)
+                 for port in ports.get(Operator.F_INIT, [])])
+        no_settings_in = not self._communicator.settings_in_connected()
+
+        if f_init_not_connected and no_settings_in:
+            do_reuse = self._first_run
+        else:
+            for message in self._f_init_cache.values():
+                if isinstance(message.data, ClosePort):
+                    do_reuse = False
+        self._first_run = False
+
+        if not do_reuse:
+            self.__close_ports()
+            self._communicator.shutdown()
+            self._deregister()
+            self.__manager.close()
+        return do_reuse
+
     def __receive_message(
             self, port_name: str, slot: Optional[int],
             default: Optional[Message], with_settings: bool
@@ -651,9 +683,10 @@ def __receive_message(
                 if with_settings and msg.settings is None:
                     err_msg = ('If you use receive_with_settings()'
                                ' on an F_INIT port, then you have to'
-                               ' pass False to reuse_instance(),'
-                               ' otherwise the settings will already'
-                               ' have been applied by MUSCLE.')
+                               ' pass apply_overlay=False to reuse_instance() '
+                               ' and should_save_final_snapshot(),'
+                               ' if applicable, otherwise the settings will'
+                               ' already have been applied by MUSCLE.')
                     self.__shutdown(err_msg)
                     raise RuntimeError(err_msg)
             else:
diff --git a/libmuscle/python/libmuscle/snapshot_manager.py b/libmuscle/python/libmuscle/snapshot_manager.py
index aa9f5dc6..e557070f 100644
--- a/libmuscle/python/libmuscle/snapshot_manager.py
+++ b/libmuscle/python/libmuscle/snapshot_manager.py
@@ -72,19 +72,14 @@ def set_checkpoint_info(self,
                     snapshot.message.timestamp,
                     snapshot.is_final_snapshot)
 
-    def reuse_instance(self,
-                       max_f_init_next_timestamp: Optional[float],
-                       snapshot_directory: Optional[Path],
-                       ) -> None:
+    def reuse_instance(self, snapshot_directory: Optional[Path]) -> None:
         """Callback on Instance.reuse_instance
 
         Args:
-            max_f_init_next_timestamp: maximum next_timestamp of all F_INIT
-                messages. May be None if no message has next_timestamp set or
-                if no F_INIT messages were received.
+            snapshot_directory: Path to store this instance's snapshots in.
         """
         if self._trigger is not None:
-            self._trigger.reuse_instance(max_f_init_next_timestamp)
+            self._trigger.reuse_instance()
 
         self._snapshot_directory = snapshot_directory
 
@@ -124,7 +119,10 @@ def should_save_snapshot(self, timestamp: float) -> bool:
             return False  # checkpointing disabled
         return self._trigger.should_save_snapshot(timestamp)
 
-    def should_save_final_snapshot(self) -> bool:
+    def should_save_final_snapshot(
+            self, do_reuse: bool,
+            f_init_max_timestamp: Optional[float]
+            ) -> bool:
         """See :meth:`TriggerManager.should_save_final_snapshot`
         """
         if self._trigger is None:

From 677693b4a49714719547effb754c50079094bcdc Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Tue, 15 Nov 2022 10:58:49 +0100
Subject: [PATCH 069/183] Refactor TriggerManager

It is now always available on SnapshotManager and checks internally if
checkpoints are defined.
---
 .../python/libmuscle/checkpoint_triggers.py   | 33 +++++++++++++++++--
 .../python/libmuscle/snapshot_manager.py      | 33 +++++--------------
 .../test/test_checkpoint_triggers.py          |  9 +++--
 3 files changed, 45 insertions(+), 30 deletions(-)

diff --git a/libmuscle/python/libmuscle/checkpoint_triggers.py b/libmuscle/python/libmuscle/checkpoint_triggers.py
index 51bd7848..ff12918b 100644
--- a/libmuscle/python/libmuscle/checkpoint_triggers.py
+++ b/libmuscle/python/libmuscle/checkpoint_triggers.py
@@ -187,8 +187,20 @@ class TriggerManager:
     """Manages all checkpoint triggers and checks if a snapshot must be saved.
     """
 
-    def __init__(self, utc_reference: datetime, checkpoints: Checkpoints
-                 ) -> None:
+    def __init__(self) -> None:
+        self._has_checkpoints = False
+        self._last_triggers = []    # type: List[str]
+        self._monotonic_reference = time.monotonic()
+
+    def set_checkpoint_info(
+            self, utc_reference: datetime, checkpoints: Checkpoints) -> None:
+        """Register checkpoint info received from the muscle manager.
+        """
+        if not checkpoints:
+            self._has_checkpoints = False
+            return
+
+        self._has_checkpoints = True
         self._monotonic_reference = _utc_to_monotonic(utc_reference)
 
         self._wall = CombinedCheckpointTriggers(checkpoints.wallclock_time)
@@ -200,7 +212,6 @@ def __init__(self, utc_reference: datetime, checkpoints: Checkpoints
         self._nextsim = None        # type: Optional[float]
         self._sim_reset = True
 
-        self._last_triggers = []    # type: List[str]
         self._first_reuse = True
         self._max_f_init_next_timestamp = None  # type: Optional[float]
 
@@ -218,6 +229,9 @@ def elapsed_walltime(self) -> float:
     def should_save_snapshot(self, timestamp: float) -> bool:
         """Handles instance.should_save_snapshot
         """
+        if not self._has_checkpoints:
+            return False
+
         if self._should_have_saved:
             _checkpoint_error('"should_save_snapshot" or '
                               '"should_save_final_snapshot" returned positive'
@@ -231,6 +245,9 @@ def should_save_snapshot(self, timestamp: float) -> bool:
     def should_save_final_snapshot(self) -> bool:
         """Handles instance.should_save_final_snapshot
         """
+        if not self._has_checkpoints:
+            return False
+
         if self._should_have_saved:
             _checkpoint_error('"should_save_snapshot" or '
                               '"should_save_final_snapshot" returned positive'
@@ -255,6 +272,8 @@ def should_save_final_snapshot(self) -> bool:
     def reuse_instance(self) -> None:
         """Cleanup between instance reuse
         """
+        if not self._has_checkpoints:
+            return
         if self._first_reuse:
             self._first_reuse = False
         else:
@@ -277,6 +296,14 @@ def update_checkpoints(self, timestamp: float, final: bool) -> None:
             timestamp: timestamp as reported by the instance
             next_timestamp: next timestamp as reported by the instance
         """
+        if not self._has_checkpoints:
+            _logger.info('Saving a snapshot, but no snapshots requested by the'
+                         ' workflow. Hint: use Instance.should_save_snapshot(),'
+                         ' Instance.should_save_final_snapshot() or'
+                         ' Instance.snapshots_enabled() to test if it is useful'
+                         ' to save a snapshot.')
+            return
+
         self._prevwall = self.elapsed_walltime()
         self._nextwall = self._wall.next_checkpoint(self._prevwall)
 
diff --git a/libmuscle/python/libmuscle/snapshot_manager.py b/libmuscle/python/libmuscle/snapshot_manager.py
index e557070f..4061f092 100644
--- a/libmuscle/python/libmuscle/snapshot_manager.py
+++ b/libmuscle/python/libmuscle/snapshot_manager.py
@@ -41,8 +41,8 @@ def __init__(self,
         self._manager = manager
 
         self._first_reuse = True
+        self._trigger = TriggerManager()
         self._resume_from_snapshot = None   # type: Optional[Snapshot]
-        self._trigger = None                # type: Optional[TriggerManager]
         self._snapshot_directory = None     # type: Optional[Path]
         self._next_snapshot_num = 1
 
@@ -60,17 +60,15 @@ def set_checkpoint_info(self,
             checkpoints: requested workflow checkpoints
             resume: previous snapshot to resume from (or None if not resuming)
         """
-        if checkpoints:
-            self._trigger = TriggerManager(utc_reference, checkpoints)
+        self._trigger.set_checkpoint_info(utc_reference, checkpoints)
         if resume is not None:
             self.__load_snapshot(resume)
             snapshot = cast(Snapshot, self._resume_from_snapshot)
             self._communicator.restore_message_counts(
                 snapshot.port_message_counts)
-            if self._trigger:
-                self._trigger.update_checkpoints(
-                    snapshot.message.timestamp,
-                    snapshot.is_final_snapshot)
+            self._trigger.update_checkpoints(
+                snapshot.message.timestamp,
+                snapshot.is_final_snapshot)
 
     def reuse_instance(self, snapshot_directory: Optional[Path]) -> None:
         """Callback on Instance.reuse_instance
@@ -78,8 +76,7 @@ def reuse_instance(self, snapshot_directory: Optional[Path]) -> None:
         Args:
             snapshot_directory: Path to store this instance's snapshots in.
         """
-        if self._trigger is not None:
-            self._trigger.reuse_instance()
+        self._trigger.reuse_instance()
 
         self._snapshot_directory = snapshot_directory
 
@@ -115,8 +112,6 @@ def load_snapshot(self) -> Message:
     def should_save_snapshot(self, timestamp: float) -> bool:
         """See :meth:`TriggerManager.should_save_snapshot`
         """
-        if self._trigger is None:
-            return False  # checkpointing disabled
         return self._trigger.should_save_snapshot(timestamp)
 
     def should_save_final_snapshot(
@@ -125,8 +120,6 @@ def should_save_final_snapshot(
             ) -> bool:
         """See :meth:`TriggerManager.should_save_final_snapshot`
         """
-        if self._trigger is None:
-            return False  # checkpointing disabled
         return self._trigger.should_save_final_snapshot()
 
     def save_snapshot(self, msg: Message) -> None:
@@ -146,14 +139,8 @@ def __save_snapshot(self, msg: Message, final: bool) -> None:
             msg: message object representing the snapshot
             final: True iff called from save_final_snapshot
         """
-        if self._trigger is None:
-            _logger.info('Saving a snapshot but no checkpoints requested'
-                         ' by the workflow.')
-            triggers = []
-            wallclock_time = 0.0
-        else:
-            triggers = self._trigger.get_triggers()
-            wallclock_time = self._trigger.elapsed_walltime()
+        triggers = self._trigger.get_triggers()
+        wallclock_time = self._trigger.elapsed_walltime()
 
         port_message_counts = self._communicator.get_message_counts()
         snapshot = MsgPackSnapshot(
@@ -163,9 +150,7 @@ def __save_snapshot(self, msg: Message, final: bool) -> None:
         metadata = SnapshotMetadata.from_snapshot(snapshot, str(path))
         self._manager.submit_snapshot_metadata(self._instance_id, metadata)
 
-        if self._trigger is not None:
-            self._trigger.update_checkpoints(
-                msg.timestamp, final)
+        self._trigger.update_checkpoints(msg.timestamp, final)
 
     def __load_snapshot(self, snapshot_location: Path) -> None:
         """Load a previously stored snapshot from the filesystem
diff --git a/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py b/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py
index 873f79d8..4f6eed0d 100644
--- a/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py
+++ b/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py
@@ -141,7 +141,8 @@ def test_trigger_manager_reference_time():
     monotonic_now = time.monotonic()
     utcnow = datetime.now(timezone.utc)
     reference = utcnow - timedelta(seconds=15)
-    trigger_manager = TriggerManager(reference, Checkpoints())
+    trigger_manager = TriggerManager()
+    trigger_manager.set_checkpoint_info(reference, Checkpoints(at_end=True))
     elapsed_walltime = trigger_manager.elapsed_walltime()
     elapsed_monotonic = time.monotonic() - monotonic_now
     assert 15.0 < elapsed_walltime <= (15.0 + elapsed_monotonic)
@@ -150,7 +151,8 @@ def test_trigger_manager_reference_time():
 @pytest.mark.skip("To be updated")
 def test_trigger_manager():
     reference = datetime.now(timezone.utc)
-    trigger_manager = TriggerManager(reference, Checkpoints(
+    trigger_manager = TriggerManager()
+    trigger_manager.set_checkpoint_info(reference, Checkpoints(
             wallclock_time=[CheckpointAtRule([1e-12])],
             simulation_time=[CheckpointAtRule([1, 3, 5])]))
 
@@ -203,7 +205,8 @@ def test_trigger_manager_warnings(caplog: pytest.LogCaptureFixture,
     monkeypatch.setenv("MUSCLE_DISABLE_CHECKPOINT_ERRORS", "1")
 
     reference = datetime.now(timezone.utc)
-    trigger_manager = TriggerManager(reference, Checkpoints(
+    trigger_manager = TriggerManager()
+    trigger_manager.set_checkpoint_info(reference, Checkpoints(
             simulation_time=[CheckpointAtRule([1, 3, 5])]))
 
     trigger_manager.reuse_instance(2)

From a8d81bb2969774fc81b270cb917d3f22d5ccbf99 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Tue, 15 Nov 2022 13:07:48 +0100
Subject: [PATCH 070/183] Process documentation comments

---
 docs/source/coupling.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/coupling.rst b/docs/source/coupling.rst
index 8c764a97..5fccaf73 100644
--- a/docs/source/coupling.rst
+++ b/docs/source/coupling.rst
@@ -4,8 +4,8 @@ Coupling your model
 Multicast
 ---------
 
-With MUSCLE3 you can connect an output port to multiple input ports. This is
-called multicast. When a submodel sends a message on a port that is connected to
+With MUSCLE3 you can connect an output port to multiple input ports.
+When a submodel sends a message on a port that is connected to
 multiple input ports, the message is copied and sent to each connected port.
 
 .. note::

From 20826eeb279ddc9d02f356a4b4ee935038beebb8 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Tue, 15 Nov 2022 13:13:57 +0100
Subject: [PATCH 071/183] Move profiling of multicast messages out for-loop

---
 libmuscle/python/libmuscle/communicator.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/libmuscle/python/libmuscle/communicator.py b/libmuscle/python/libmuscle/communicator.py
index cefd5e3b..7e9a7131 100644
--- a/libmuscle/python/libmuscle/communicator.py
+++ b/libmuscle/python/libmuscle/communicator.py
@@ -206,6 +206,8 @@ def send_message(
             return
 
         port = self._ports[port_name]
+        profile_event = self._profiler.start(ProfileEventType.SEND, port,
+                                             None, slot, None)
 
         recv_endpoints = self._peer_manager.get_peer_endpoints(
                 snd_endpoint.port, slot_list)
@@ -215,9 +217,6 @@ def send_message(
             port_length = self._ports[port_name].get_length()
 
         for recv_endpoint in recv_endpoints:
-            profile_event = self._profiler.start(ProfileEventType.SEND, port,
-                                                 None, slot, None)
-
             mcp_message = MPPMessage(snd_endpoint.ref(), recv_endpoint.ref(),
                                      port_length,
                                      message.timestamp, message.next_timestamp,
@@ -226,10 +225,10 @@ def send_message(
             encoded_message = mcp_message.encoded()
             self._post_office.deposit(recv_endpoint.ref(), encoded_message)
 
-            profile_event.stop()
-            if port.is_vector():
-                profile_event.port_length = port.get_length()
-            profile_event.message_size = len(encoded_message)
+        profile_event.stop()
+        if port.is_vector():
+            profile_event.port_length = port.get_length()
+        profile_event.message_size = len(encoded_message)
 
     def receive_message(self, port_name: str, slot: Optional[int] = None,
                         default: Optional[Message] = None

From f0f974068cce1e44457519d38acd214cc80c6ee8 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Tue, 15 Nov 2022 13:16:06 +0100
Subject: [PATCH 072/183] Update docstring

---
 libmuscle/python/libmuscle/peer_manager.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libmuscle/python/libmuscle/peer_manager.py b/libmuscle/python/libmuscle/peer_manager.py
index 0a7600c0..a0c28c9a 100644
--- a/libmuscle/python/libmuscle/peer_manager.py
+++ b/libmuscle/python/libmuscle/peer_manager.py
@@ -91,14 +91,14 @@ def get_peer_locations(self, peer_instance: Reference) -> List[str]:
 
     def get_peer_endpoints(self, port: Identifier, slot: List[int]
                            ) -> List[Endpoint]:
-        """Determine the peer endpoint for the given port and slot.
+        """Determine the peer endpoints for the given port and slot.
 
         Args:
             port: The port on our side to send or receive on.
             slot: The slot to send or receive on.
 
         Returns:
-            The peer endpoint.
+            The peer endpoints.
         """
         peers = self.__peers[self.__kernel + port]
         endpoints = []

From e00e1eabd3f366b9aa7f2a185a62cd68cf5ac1eb Mon Sep 17 00:00:00 2001
From: Lourens Veen <l.veen@esciencecenter.nl>
Date: Tue, 15 Nov 2022 13:59:27 +0100
Subject: [PATCH 073/183] Add clang to the CI

---
 .github/workflows/ci_ubuntu_22.04_clang.yaml | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)
 create mode 100644 .github/workflows/ci_ubuntu_22.04_clang.yaml

diff --git a/.github/workflows/ci_ubuntu_22.04_clang.yaml b/.github/workflows/ci_ubuntu_22.04_clang.yaml
new file mode 100644
index 00000000..125b3fe6
--- /dev/null
+++ b/.github/workflows/ci_ubuntu_22.04_clang.yaml
@@ -0,0 +1,20 @@
+# Run Continuous Integration for the latest Ubuntu release
+# This mainly checks for issues/regressions in the native build
+name: native_compatibility_ubuntu22.04_clang
+on:
+  schedule:
+    - cron: '0 3 * * 0'
+  push:
+    branches:
+      - 'release-*'
+      - fix_native_compatibility_ci
+      - feature/clang_build
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v2
+
+    - name: Run tests on Ubuntu 22.04 with Clang
+      run: docker run -v "${GITHUB_WORKSPACE}:/workspace" --env LC_ALL=C.UTF-8 --env LANG=C.UTF-8 --env DEBIAN_FRONTEND=noninteractive ubuntu:22.04 /bin/bash -c 'apt-get update && apt-get -y dist-upgrade && apt-get -y install build-essential clang cmake gfortran git valgrind libopenmpi-dev pkg-config python3 python3-pip python3-venv curl && apt-get -y remove libssl-dev && useradd -m -d /home/muscle3 muscle3 && su muscle3 -c -- "cp -r --preserve=mode /workspace /home/muscle3/muscle3" && su muscle3 -c -- "pip3 install -U pip setuptools wheel" && su muscle3 -c -- "pip3 install \"ymmsl>=0.12.0,<0.13\" qcg-pilotjob==0.13.1" && su muscle3 -c -- "cd /home/muscle3/muscle3 && CXXFLAGS=-fPIE OMPI_CXX=clang++ CXX=clang++ make test_examples"'

From a0dabc31fe55361b671445efe2834e3e0a2e45f0 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Tue, 15 Nov 2022 15:10:57 +0100
Subject: [PATCH 074/183] MMP - get checkpoint info as separate request type

---
 libmuscle/python/libmuscle/instance.py        | 14 +++----
 .../python/libmuscle/manager/mmp_server.py    | 42 ++++++++-----------
 .../manager/test/test_mmp_request_handler.py  | 19 ++++-----
 libmuscle/python/libmuscle/mcp/protocol.py    |  1 +
 libmuscle/python/libmuscle/mmp_client.py      | 27 +++++++-----
 .../python/libmuscle/snapshot_manager.py      | 19 +++++----
 .../python/libmuscle/test/test_instance.py    |  6 +--
 .../libmuscle/test/test_snapshot_manager.py   |  6 +--
 8 files changed, 67 insertions(+), 67 deletions(-)

diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py
index 7e2559d9..c064e6af 100644
--- a/libmuscle/python/libmuscle/instance.py
+++ b/libmuscle/python/libmuscle/instance.py
@@ -1,5 +1,4 @@
 from copy import copy
-from datetime import datetime
 import logging
 import os
 from pathlib import Path
@@ -9,7 +8,7 @@
 from typing_extensions import Literal
 
 from ymmsl import (Identifier, Operator, SettingValue, Port, Reference,
-                   Settings, Checkpoints)
+                   Settings)
 
 from libmuscle.communicator import Communicator, Message
 from libmuscle.settings_manager import SettingsManager
@@ -79,11 +78,11 @@ def __init__(self, ports: Optional[Dict[Operator, List[str]]] = None
 
         self._f_init_cache = dict()     # type: _FInitCacheType
 
-        checkpoint_info = self._register()
+        self._register()
         self._connect()
-        # Note: SnapshotManager.set_checkpoint_info needs to have the ports
+        # Note: SnapshotManager.get_checkpoint_info needs to have the ports
         # initialized so it comes after self._connect()
-        self._snapshot_manager.set_checkpoint_info(*checkpoint_info)
+        self._snapshot_manager.get_checkpoint_info()
         self._set_local_log_level()
         self._set_remote_log_level()
 
@@ -548,17 +547,16 @@ def save_final_snapshot(self, message: Message) -> None:
         """
         return self._snapshot_manager.save_final_snapshot(message)
 
-    def _register(self) -> Tuple[datetime, Checkpoints, Optional[Path]]:
+    def _register(self) -> None:
         """Register this instance with the manager.
         """
         register_event = self._profiler.start(ProfileEventType.REGISTER)
         locations = self._communicator.get_locations()
         port_list = self.__list_declared_ports()
-        checkpoint_info = self.__manager.register_instance(
+        self.__manager.register_instance(
                 self._instance_name(), locations, port_list)
         register_event.stop()
         _logger.info('Registered with the manager')
-        return checkpoint_info
 
     def _connect(self) -> None:
         """Connect this instance to the given peers / conduits.
diff --git a/libmuscle/python/libmuscle/manager/mmp_server.py b/libmuscle/python/libmuscle/manager/mmp_server.py
index f5b8b692..9382d0eb 100644
--- a/libmuscle/python/libmuscle/manager/mmp_server.py
+++ b/libmuscle/python/libmuscle/manager/mmp_server.py
@@ -1,7 +1,7 @@
 from datetime import datetime, timezone
 import errno
 import logging
-from typing import Any, Dict, Optional, Tuple, cast, List
+from typing import Any, Dict, cast, List
 
 import msgpack
 from ymmsl import (
@@ -23,8 +23,6 @@
 
 _logger = logging.getLogger(__name__)
 
-_EncodedCheckpointType = Dict[str, List[Dict[str, Any]]]
-
 
 def decode_operator(data: str) -> Operator:
     """Create an Operator from a MsgPack-compatible value."""
@@ -41,9 +39,10 @@ def encode_conduit(conduit: Conduit) -> List[str]:
     return [str(conduit.sender), str(conduit.receiver)]
 
 
-def encode_checkpoints(checkpoints: Checkpoints) -> _EncodedCheckpointType:
+def encode_checkpoints(checkpoints: Checkpoints) -> Dict[str, Any]:
     """Convert a Checkpoins to a MsgPack-compatible value."""
     return {
+        "at_end": checkpoints.at_end,
         "wallclock_time": [vars(rule) for rule in checkpoints.wallclock_time],
         "simulation_time": [vars(rule) for rule in checkpoints.simulation_time]
     }
@@ -100,6 +99,8 @@ def handle_request(self, request: bytes) -> bytes:
             response = self._submit_profile_events(*req_args)
         elif req_type == RequestType.SUBMIT_SNAPSHOT.value:
             response = self._submit_snapshot(*req_args)
+        elif req_type == RequestType.GET_CHECKPOINT_INFO.value:
+            response = self._get_checkpoint_info(*req_args)
 
         return cast(bytes, msgpack.packb(response, use_bin_type=True))
 
@@ -118,13 +119,6 @@ def _register_instance(
             status (ResponseType): SUCCESS or ERROR
             error_msg (str): An error message, only present if status
                 equals ERROR
-            checkpoint_info (Tuple[str, bytes, Optional[str]]): Checkpoint info,
-                only present if status equals SUCCESS. The first item is a tuple
-                encoding of the wallclock reference time (year, month, day,
-                hour, minute, second, microsecond) in UTC. The second item is a
-                dict encoding a ymmsl.Checkpoints object. The final item is the
-                checkpoint filename that the registered instance should resume
-                from, or None if no resume is requested.
         """
         port_objs = [decode_port(p) for p in ports]
         instance = Reference(instance_id)
@@ -132,8 +126,7 @@ def _register_instance(
             self._instance_registry.add(instance, locations, port_objs)
 
             _logger.info(f'Registered instance {instance_id}')
-            checkpoint_info = self._get_checkpoint_info(instance)
-            return [ResponseType.SUCCESS.value, checkpoint_info]
+            return [ResponseType.SUCCESS.value]
         except AlreadyRegistered:
             return [
                     ResponseType.ERROR.value,
@@ -279,28 +272,29 @@ def _submit_snapshot(
         self._snapshot_registry.register_snapshot(instance, snapshot_obj)
         return [ResponseType.SUCCESS.value]
 
-    def _get_checkpoint_info(
-                self,
-                instance: Reference
-                ) -> Tuple[float, _EncodedCheckpointType, Optional[str]]:
+    def _get_checkpoint_info(self, instance_id: str) -> Any:
         """Get checkpoint info for an instance
 
         Args:
             instance: The instance whose checkpoint info to get
 
         Returns:
-            wallclock_reference_time: tuple encoding UTC reference for wallclock
-                time = 0: (year, month, day, hour, minute, second, microsecond)
-            checkpoints: yaml-encoded ymmsl.Checkpoints object
-            resume: path of the snapshot file to resume from (or None if not
-                resuming)
+            A list containing the following values on success:
+
+            status (ResponseType): SUCCESS
+            wallclock_reference_time (float): Unix timestamp (in UTC) indicating
+                wallclock time of the start of the workflow.
+            checkpoints (dict): Dictionary encdoing a ymmsl.Checkpoints object.
+            resume_path (Optional[str]): Checkpoint filename to resume from.
         """
+        instance = Reference(instance_id)
         resume = None
         if instance in self._configuration.resume:
             resume = str(self._configuration.resume[instance])
-        return (self._reference_timestamp,
+        return [ResponseType.SUCCESS.value,
+                self._reference_timestamp,
                 encode_checkpoints(self._configuration.checkpoints),
-                resume)
+                resume]
 
 
 class MMPServer:
diff --git a/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py b/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py
index ac80dca2..89de4068 100644
--- a/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py
+++ b/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py
@@ -95,31 +95,28 @@ def test_register_instance(mmp_request_handler, instance_registry):
     assert registered_ports['test_instance'][0].operator == Operator.F_INIT
 
 
-def test_register_instance_checkpoint_info(
-        mmp_configuration, mmp_request_handler):
+def test_get_checkpoint_info(mmp_configuration, mmp_request_handler):
     resume_path = Path('/path/to/resume.pack')
     mmp_configuration.resume = {Reference('test_instance'): resume_path}
-    mmp_configuration.checkpoints = Checkpoints([CheckpointRangeRule(every=10),
-                                                 CheckpointAtRule([1, 2, 3.0])])
+    mmp_configuration.checkpoints = Checkpoints(
+            True,
+            [CheckpointRangeRule(every=10), CheckpointAtRule([1, 2, 3.0])])
 
-    request = [
-            RequestType.REGISTER_INSTANCE.value,
-            'test_instance',
-            ['tcp://localhost:10000'],
-            [['test_in', 'F_INIT']]]
+    request = [RequestType.GET_CHECKPOINT_INFO.value, 'test_instance']
     encoded_request = msgpack.packb(request, use_bin_type=True)
 
     result = mmp_request_handler.handle_request(encoded_request)
     decoded_result = msgpack.unpackb(result, raw=False)
 
     assert decoded_result[0] == ResponseType.SUCCESS.value
-    timestamp, checkpoints, resume = decoded_result[1]
+    timestamp, checkpoints, resume = decoded_result[1:]
 
     ref_time = datetime.fromtimestamp(timestamp, tz=timezone.utc)
     assert ref_time == mmp_request_handler._reference_time
 
     assert isinstance(checkpoints, dict)
-    assert checkpoints.keys() == {'wallclock_time', 'simulation_time'}
+    assert checkpoints.keys() == {'at_end', 'wallclock_time', 'simulation_time'}
+    assert checkpoints['at_end'] is True
     assert checkpoints['simulation_time'] == []
     wallclock_time = checkpoints['wallclock_time']
     assert len(wallclock_time) == 2
diff --git a/libmuscle/python/libmuscle/mcp/protocol.py b/libmuscle/python/libmuscle/mcp/protocol.py
index 06d1c0da..5d1217ed 100644
--- a/libmuscle/python/libmuscle/mcp/protocol.py
+++ b/libmuscle/python/libmuscle/mcp/protocol.py
@@ -21,6 +21,7 @@ class RequestType(Enum):
     SUBMIT_LOG_MESSAGE = 5
     SUBMIT_PROFILE_EVENTS = 6
     SUBMIT_SNAPSHOT = 7
+    GET_CHECKPOINT_INFO = 8
 
     # MUSCLE Peer Protocol
     GET_NEXT_MESSAGE = 21
diff --git a/libmuscle/python/libmuscle/mmp_client.py b/libmuscle/python/libmuscle/mmp_client.py
index 6a3fe729..37effdca 100644
--- a/libmuscle/python/libmuscle/mmp_client.py
+++ b/libmuscle/python/libmuscle/mmp_client.py
@@ -57,12 +57,12 @@ def decode_checkpoint_rule(rule: Dict[str, Any]) -> CheckpointRule:
         return CheckpointAtRule(**rule)
     if rule.keys() == {'start', 'stop', 'every'}:
         return CheckpointRangeRule(**rule)
-    raise ValueError('Cannot convert {rule} to a checkpoint rule.')
+    raise ValueError(f'Cannot convert {rule} to a checkpoint rule.')
 
 
 def decode_checkpoint_info(
         reference_timestamp: float,
-        checkpoints_dict: Dict[str, List[Dict[str, Any]]],
+        checkpoints_dict: Dict[str, Any],
         resume: Optional[str]
         ) -> Tuple[datetime, Checkpoints, Optional[Path]]:
     """Decode checkpoint info from a MsgPack-compatible value.
@@ -80,6 +80,7 @@ def decode_checkpoint_info(
     """
     ref_time = datetime.fromtimestamp(reference_timestamp, tz=timezone.utc)
     checkpoints = Checkpoints(
+            at_end=checkpoints_dict["at_end"],
             wallclock_time=[decode_checkpoint_rule(rule)
                             for rule in checkpoints_dict["wallclock_time"]],
             simulation_time=[decode_checkpoint_rule(rule)
@@ -161,9 +162,21 @@ def get_settings(self) -> Settings:
         response = self._call_manager(request)
         return Settings(response[1])
 
+    def get_checkpoint_info(self, name: Reference
+                            ) -> Tuple[datetime, Checkpoints, Optional[Path]]:
+        """Get the checkpoint info from the manager.
+
+        Returns:
+            wallclock_time_reference: UTC time where wallclock_time = 0
+            checkpoints: checkpoint configuration
+            resume: path to the resume snapshot
+        """
+        request = [RequestType.GET_CHECKPOINT_INFO.value, str(name)]
+        response = self._call_manager(request)
+        return decode_checkpoint_info(*response[1:])
+
     def register_instance(self, name: Reference, locations: List[str],
-                          ports: List[Port]
-                          ) -> Tuple[datetime, Checkpoints, Optional[Path]]:
+                          ports: List[Port]) -> None:
         """Register a component instance with the manager.
 
         Args:
@@ -171,11 +184,6 @@ def register_instance(self, name: Reference, locations: List[str],
             locations: List of places where the instance can be
                     reached.
             ports: List of ports of this instance.
-
-        Returns:
-            wallclock_time_reference: UTC time where wallclock_time = 0
-            checkpoints: checkpoint configuration
-            resume: path to the resume snapshot
         """
         request = [
                 RequestType.REGISTER_INSTANCE.value,
@@ -185,7 +193,6 @@ def register_instance(self, name: Reference, locations: List[str],
         if response[0] == ResponseType.ERROR.value:
             raise RuntimeError(
                     f'Error registering instance: {response[1]}')
-        return decode_checkpoint_info(*response[1])
 
     def request_peers(
             self, name: Reference) -> Tuple[
diff --git a/libmuscle/python/libmuscle/snapshot_manager.py b/libmuscle/python/libmuscle/snapshot_manager.py
index 4061f092..ffd7e4b2 100644
--- a/libmuscle/python/libmuscle/snapshot_manager.py
+++ b/libmuscle/python/libmuscle/snapshot_manager.py
@@ -46,14 +46,17 @@ def __init__(self,
         self._snapshot_directory = None     # type: Optional[Path]
         self._next_snapshot_num = 1
 
-    def set_checkpoint_info(self,
-                            utc_reference: datetime,
-                            checkpoints: Checkpoints,
-                            resume: Optional[Path]) -> None:
-        """Callback after registering with the manager.
-
-        Provide the snapshot manager with info on workflow checkpoints and if we
-        should resume from a previous snapshot.
+    def get_checkpoint_info(self) -> None:
+        """Request checkpoint info from the muscle manager.
+        """
+        checkpoint_info = self._manager.get_checkpoint_info(self._instance_id)
+        self._set_checkpoint_info(*checkpoint_info)
+
+    def _set_checkpoint_info(self,
+                             utc_reference: datetime,
+                             checkpoints: Checkpoints,
+                             resume: Optional[Path]) -> None:
+        """Apply checkpoint info received from the manager.
 
         Args:
             utc_reference: datetime (in UTC) indicating wallclock_time=0
diff --git a/libmuscle/python/libmuscle/test/test_instance.py b/libmuscle/python/libmuscle/test/test_instance.py
index e8c7f9b0..54044a00 100644
--- a/libmuscle/python/libmuscle/test/test_instance.py
+++ b/libmuscle/python/libmuscle/test/test_instance.py
@@ -50,7 +50,7 @@ def instance(sys_argv_instance):
         mmp_client_object = MagicMock()
         mmp_client_object.request_peers.return_value = (None, None, None)
         checkpoint_info = (datetime.now(timezone.utc), Checkpoints(), None)
-        mmp_client_object.register_instance.return_value = checkpoint_info
+        mmp_client_object.get_checkpoint_info.return_value = checkpoint_info
         mmp_client.return_value = mmp_client_object
 
         instance = Instance({
@@ -68,7 +68,7 @@ def instance2(sys_argv_instance):
         mmp_client_object = MagicMock()
         mmp_client_object.request_peers.return_value = (None, None, None)
         checkpoint_info = (datetime.now(timezone.utc), Checkpoints(), None)
-        mmp_client_object.register_instance.return_value = checkpoint_info
+        mmp_client_object.get_checkpoint_info.return_value = checkpoint_info
         mmp_client.return_value = mmp_client_object
         instance = Instance({
             Operator.F_INIT: ['in[]'],
@@ -83,7 +83,7 @@ def test_create_instance(
         mmp_client_object = MagicMock()
         mmp_client_object.request_peers.return_value = (None, None, None)
         checkpoint_info = (datetime.now(timezone.utc), Checkpoints(), None)
-        mmp_client_object.register_instance.return_value = checkpoint_info
+        mmp_client_object.get_checkpoint_info.return_value = checkpoint_info
         mmp_client.return_value = mmp_client_object
         ports = {
             Operator.F_INIT: ['in'],
diff --git a/libmuscle/python/libmuscle/test/test_snapshot_manager.py b/libmuscle/python/libmuscle/test/test_snapshot_manager.py
index bfee09a1..462c4cd9 100644
--- a/libmuscle/python/libmuscle/test/test_snapshot_manager.py
+++ b/libmuscle/python/libmuscle/test/test_snapshot_manager.py
@@ -19,7 +19,7 @@ def test_no_checkpointing(caplog: pytest.LogCaptureFixture, tmp_path: Path
     communicator.get_message_counts.return_value = {}
     snapshot_manager = SnapshotManager(Reference('test'), manager, communicator)
 
-    snapshot_manager.set_checkpoint_info(
+    snapshot_manager._set_checkpoint_info(
             datetime.now(timezone.utc), Checkpoints(), None)
 
     snapshot_manager.reuse_instance(None, Path(tmp_path))
@@ -45,7 +45,7 @@ def test_save_load_checkpoint(tmp_path: Path) -> None:
     snapshot_manager = SnapshotManager(instance_id, manager, communicator)
 
     checkpoints = Checkpoints(simulation_time=[CheckpointRangeRule(every=1)])
-    snapshot_manager.set_checkpoint_info(
+    snapshot_manager._set_checkpoint_info(
             datetime.now(timezone.utc), checkpoints, None)
 
     snapshot_manager.reuse_instance(None, tmp_path)
@@ -73,7 +73,7 @@ def test_save_load_checkpoint(tmp_path: Path) -> None:
 
     snapshot_manager2 = SnapshotManager(instance_id, manager, communicator)
 
-    snapshot_manager2.set_checkpoint_info(
+    snapshot_manager2._set_checkpoint_info(
             datetime.now(timezone.utc), checkpoints, fpath)
     communicator.restore_message_counts.assert_called_with(port_message_counts)
 

From d3bb5cdeedb2b712ce9ab953f61c916eceaa33af Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Wed, 16 Nov 2022 13:52:10 +0100
Subject: [PATCH 075/183] Revert std::move additions

Give warnings in clang++ version 10, but not in version 14.
Having std::move prevents RVO, so removing it.
---
 libmuscle/cpp/src/libmuscle/data.cpp                     | 2 +-
 libmuscle/cpp/src/libmuscle/mcp/tcp_transport_client.cpp | 2 +-
 libmuscle/cpp/src/libmuscle/mpp_message.cpp              | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/libmuscle/cpp/src/libmuscle/data.cpp b/libmuscle/cpp/src/libmuscle/data.cpp
index 666a8423..53b57694 100644
--- a/libmuscle/cpp/src/libmuscle/data.cpp
+++ b/libmuscle/cpp/src/libmuscle/data.cpp
@@ -962,7 +962,7 @@ DataConstRef DataConstRef::grid_data_<bool>(
         Data result = Data::byte_array(num_elems);
         char * data_copy = result.as_byte_array();
         std::copy(data, data + num_elems, data_copy);
-        return std::move(result);
+        return result;
     }
 }
 
diff --git a/libmuscle/cpp/src/libmuscle/mcp/tcp_transport_client.cpp b/libmuscle/cpp/src/libmuscle/mcp/tcp_transport_client.cpp
index c0e95b90..959737d2 100644
--- a/libmuscle/cpp/src/libmuscle/mcp/tcp_transport_client.cpp
+++ b/libmuscle/cpp/src/libmuscle/mcp/tcp_transport_client.cpp
@@ -129,7 +129,7 @@ DataConstRef TcpTransportClient::call(
     int64_t length = recv_int64(socket_fd_);
     auto result = Data::byte_array(length);
     recv_all(socket_fd_, result.as_byte_array(), result.size());
-    return std::move(result);
+    return result;
 }
 
 void TcpTransportClient::close() {
diff --git a/libmuscle/cpp/src/libmuscle/mpp_message.cpp b/libmuscle/cpp/src/libmuscle/mpp_message.cpp
index 5f796224..2962e31c 100644
--- a/libmuscle/cpp/src/libmuscle/mpp_message.cpp
+++ b/libmuscle/cpp/src/libmuscle/mpp_message.cpp
@@ -76,7 +76,7 @@ DataConstRef MPPMessage::encoded() const {
     auto bytes = Data::byte_array(sbuf.size());
     memcpy(bytes.as_byte_array(), sbuf.data(), sbuf.size());
 
-    return std::move(bytes);
+    return bytes;
 }
 
 } }

From c697d532075ac871892a562c743f10f008e98eec Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Wed, 16 Nov 2022 13:52:58 +0100
Subject: [PATCH 076/183] Removing const in method that returns by value

---
 libmuscle/cpp/src/libmuscle/peer_manager.cpp | 2 +-
 libmuscle/cpp/src/libmuscle/peer_manager.hpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/libmuscle/cpp/src/libmuscle/peer_manager.cpp b/libmuscle/cpp/src/libmuscle/peer_manager.cpp
index 51772072..d5e8923c 100644
--- a/libmuscle/cpp/src/libmuscle/peer_manager.cpp
+++ b/libmuscle/cpp/src/libmuscle/peer_manager.cpp
@@ -65,7 +65,7 @@ std::vector<std::string> PeerManager::get_peer_locations(
     return peer_locations_.at(peer_instance);
 }
 
-std::vector<Endpoint> const PeerManager::get_peer_endpoints(
+std::vector<Endpoint> PeerManager::get_peer_endpoints(
         Identifier const & port,
         std::vector<int> const & slot
         ) const
diff --git a/libmuscle/cpp/src/libmuscle/peer_manager.hpp b/libmuscle/cpp/src/libmuscle/peer_manager.hpp
index c6ac5ff7..024b40fa 100644
--- a/libmuscle/cpp/src/libmuscle/peer_manager.hpp
+++ b/libmuscle/cpp/src/libmuscle/peer_manager.hpp
@@ -83,7 +83,7 @@ class PeerManager {
          * @param slot The slot to send or receive on.
          * @return The peer endpoints.
          */
-        std::vector<Endpoint> const get_peer_endpoints(
+        std::vector<Endpoint> get_peer_endpoints(
                 ymmsl::Identifier const & port,
                 std::vector<int> const & slot) const;
 

From 06fdcdffe255e948a6bbbd3548379fa92d1f64f9 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Wed, 16 Nov 2022 17:10:42 +0100
Subject: [PATCH 077/183] More unified integration tests with native code

---
 integration_test/conftest.py                 | 118 ++++++++++++++-----
 integration_test/test_cpp_macro_micro.py     |  48 ++------
 integration_test/test_fortran_macro_micro.py |  54 ++-------
 integration_test/test_multicast_cpp.py       |  66 ++---------
 4 files changed, 113 insertions(+), 173 deletions(-)

diff --git a/integration_test/conftest.py b/integration_test/conftest.py
index 76cf680d..ad59842a 100644
--- a/integration_test/conftest.py
+++ b/integration_test/conftest.py
@@ -1,6 +1,9 @@
 import logging
 import multiprocessing as mp
 import os
+import subprocess
+import sys
+from contextlib import contextmanager, ExitStack
 from pathlib import Path
 
 import pytest
@@ -47,9 +50,80 @@ def make_server_process(ymmsl_doc, tmpdir):
     process.join()
 
 
+def _python_wrapper(instance_name, muscle_manager, callable):
+    sys.argv.append(f'--muscle-instance={instance_name}')
+    sys.argv.append(f'--muscle-manager={muscle_manager}')
+    callable()
+
+
+def run_manager_with_actors(
+        ymmsl_text, tmpdir,
+        cpp_actors={}, fortran_actors={}, python_actors={}):
+    """Start muscle_manager along with C++ and python actors.
+
+    C++ actors are a dict of instance->executable_path. Executable paths are
+    assumed to be relative to ../libmuscle/cpp/build/. LD_LIBRARY_PATH is
+    automatically updated to include the msgpack library path.
+
+    Fortran actors are a dict of instance->executable_path. Executable paths are
+    assumed to be relative to ../libmuscle/fortran/build/. LD_LIBRARY_PATH is
+    automatically updated to include the msgpack library path.
+
+    Python actors are a dict of instance->callable, where the callable
+    implements the python actor.
+    """
+    env = os.environ.copy()
+    ymmsl_doc = ymmsl.load(ymmsl_text)
+    libmuscle_dir = Path(__file__).parents[1] / 'libmuscle'
+    cpp_build_dir = libmuscle_dir / 'cpp' / 'build'
+    fortran_build_dir = libmuscle_dir / 'fortran' / 'build'
+
+    with ExitStack() as stack:
+        # start muscle_manager and extract manager location
+        ctx = contextmanager(make_server_process)(ymmsl_doc, tmpdir)
+        env['MUSCLE_MANAGER'] = stack.enter_context(ctx)
+
+        lib_paths = [cpp_build_dir / 'msgpack' / 'msgpack' / 'lib']
+        if 'LD_LIBRARY_PATH' in env:
+            env['LD_LIBRARY_PATH'] += ':' + ':'.join(map(str, lib_paths))
+        else:
+            env['LD_LIBRARY_PATH'] = ':'.join(map(str, lib_paths))
+
+        native_processes = []
+        # start native actors
+        for actors, build_dir in ((cpp_actors, cpp_build_dir),
+                                  (fortran_actors, fortran_build_dir)):
+            for instance_name, executable_path in actors.items():
+                executable = build_dir / executable_path
+                f_out = stack.enter_context(
+                        (tmpdir / f'{instance_name}_stdout.txt').open('w'))
+                f_err = stack.enter_context(
+                        (tmpdir / f'{instance_name}_stderr.txt').open('w'))
+                native_processes.append(subprocess.Popen(
+                        [str(executable), f'--muscle-instance={instance_name}'],
+                        env=env, stdout=f_out, stderr=f_err))
+
+        # start python actors
+        python_processes = []
+        for instance_name, callable in python_actors.items():
+            proc = mp.Process(
+                    target=_python_wrapper,
+                    args=(instance_name, env['MUSCLE_MANAGER'], callable))
+            proc.start()
+            python_processes.append(proc)
+
+        # check results
+        for proc in native_processes:
+            proc.wait()
+            assert proc.returncode == 0
+        for proc in python_processes:
+            proc.join()
+            assert proc.exitcode == 0
+
+
 @pytest.fixture
-def mmp_server_process(yatiml_log_warning, tmpdir):
-    ymmsl_text = (
+def mmp_server_config(yatiml_log_warning):
+    return (
             'ymmsl_version: v0.1\n'
             'model:\n'
             '  name: test_model\n'
@@ -74,14 +148,17 @@ def mmp_server_process(yatiml_log_warning, tmpdir):
             '  macro_implementation: macro.py\n'
             '  micro_implementation: micro.py\n'
             )
-    ymmsl_doc = ymmsl.load(ymmsl_text)
 
+
+@pytest.fixture
+def mmp_server_process(mmp_server_config, tmpdir):
+    ymmsl_doc = ymmsl.load(mmp_server_config)
     yield from make_server_process(ymmsl_doc, tmpdir)
 
 
 @pytest.fixture
-def mmp_server_process_simple(tmpdir, yatiml_log_warning):
-    ymmsl_text = (
+def mmp_server_config_simple(yatiml_log_warning):
+    return (
             'ymmsl_version: v0.1\n'
             'model:\n'
             '  name: test_model\n'
@@ -101,36 +178,17 @@ def mmp_server_process_simple(tmpdir, yatiml_log_warning):
             '    - [1.0, 2.0]\n'
             '    - [3.0, 1.0]\n'
             )
-    ymmsl_doc = ymmsl.load(ymmsl_text)
 
+
+@pytest.fixture
+def mmp_server_process_simple(mmp_server_config_simple, tmpdir):
+    ymmsl_doc = ymmsl.load(mmp_server_config_simple)
     yield from make_server_process(ymmsl_doc, tmpdir)
 
 
 @pytest.fixture
-def mmp_server(yatiml_log_warning):
-    ymmsl_text = (
-            'ymmsl_version: v0.1\n'
-            'model:\n'
-            '  name: test_model\n'
-            '  components:\n'
-            '    macro: macro_implementation\n'
-            '    micro:\n'
-            '      implementation: micro_implementation\n'
-            '      multiplicity: [10]\n'
-            '  conduits:\n'
-            '    macro.out: micro.in\n'
-            '    micro.out: macro.in\n'
-            'settings:\n'
-            '  test1: 13\n'
-            '  test2: 13.3\n'
-            '  test3: testing\n'
-            '  test4: True\n'
-            '  test5: [2.3, 5.6]\n'
-            '  test6:\n'
-            '    - [1.0, 2.0]\n'
-            '    - [3.0, 1.0]\n'
-            )
-    ymmsl_doc = ymmsl.load(ymmsl_text)
+def mmp_server(mmp_server_config_simple, yatiml_log_warning):
+    ymmsl_doc = ymmsl.load(mmp_server_config_simple)
 
     manager = Manager(ymmsl_doc)
     yield manager._server
diff --git a/integration_test/test_cpp_macro_micro.py b/integration_test/test_cpp_macro_micro.py
index 02156a44..fda1a232 100644
--- a/integration_test/test_cpp_macro_micro.py
+++ b/integration_test/test_cpp_macro_micro.py
@@ -1,21 +1,11 @@
-import multiprocessing as mp
-import os
 from pathlib import Path
-import subprocess
-import sys
 
 import numpy as np
 
 from libmuscle import Instance, Message
 from ymmsl import Operator
 
-from .conftest import skip_if_python_only
-
-
-def run_macro(instance_id: str, manager_location: str):
-    sys.argv.append(f'--muscle-instance={instance_id}')
-    sys.argv.append(f'--muscle-manager={manager_location}')
-    macro()
+from .conftest import skip_if_python_only, run_manager_with_actors
 
 
 def macro():
@@ -47,34 +37,12 @@ def macro():
 
 
 @skip_if_python_only
-def test_cpp_macro_micro(mmp_server_process_simple, tmp_path):
+def test_cpp_macro_micro(mmp_server_config_simple, tmp_path):
     # create C++ micro model
     # see libmuscle/cpp/src/libmuscle/tests/micro_model_test.cpp
-    cpp_build_dir = Path(__file__).parents[1] / 'libmuscle' / 'cpp' / 'build'
-    env = os.environ.copy()
-    lib_paths = [cpp_build_dir / 'msgpack' / 'msgpack' / 'lib']
-    if 'LD_LIBRARY_PATH' in env:
-        env['LD_LIBRARY_PATH'] += ':' + ':'.join(map(str, lib_paths))
-    else:
-        env['LD_LIBRARY_PATH'] = ':'.join(map(str, lib_paths))
-
-    env['MUSCLE_MANAGER'] = mmp_server_process_simple
-    cpp_test_dir = cpp_build_dir / 'libmuscle' / 'tests'
-    cpp_test_micro = cpp_test_dir / 'micro_model_test'
-
-    with (tmp_path / 'cpp_stdout.txt').open('w') as f_out:
-        with (tmp_path / 'cpp_stderr.txt').open('w') as f_err:
-            micro_result = subprocess.Popen(
-                    [str(cpp_test_micro), '--muscle-instance=micro'], env=env,
-                    stdout=f_out, stderr=f_err)
-
-    # run macro model
-    macro_process = mp.Process(
-            target=run_macro, args=('macro', mmp_server_process_simple))
-    macro_process.start()
-
-    # check results
-    micro_result.wait()
-    assert micro_result.returncode == 0
-    macro_process.join()
-    assert macro_process.exitcode == 0
+    run_manager_with_actors(
+            mmp_server_config_simple,
+            tmp_path,
+            {'micro': Path('libmuscle') / 'tests' / 'micro_model_test'},
+            {},
+            {'macro': macro})
diff --git a/integration_test/test_fortran_macro_micro.py b/integration_test/test_fortran_macro_micro.py
index e4908b75..0717891c 100644
--- a/integration_test/test_fortran_macro_micro.py
+++ b/integration_test/test_fortran_macro_micro.py
@@ -1,21 +1,11 @@
-import multiprocessing as mp
-import os
 from pathlib import Path
-import subprocess
-import sys
 
 import numpy as np
 
 from libmuscle import Instance, Message
 from ymmsl import Operator
 
-from .conftest import skip_if_python_only
-
-
-def run_macro(instance_id: str, manager_location: str):
-    sys.argv.append(f'--muscle-instance={instance_id}')
-    sys.argv.append(f'--muscle-manager={manager_location}')
-    macro()
+from .conftest import skip_if_python_only, run_manager_with_actors
 
 
 def macro():
@@ -48,40 +38,12 @@ def macro():
 
 
 @skip_if_python_only
-def test_fortran_macro_micro(mmp_server_process_simple, tmp_path):
+def test_fortran_macro_micro(mmp_server_config_simple, tmp_path):
     # create Fortran micro model
     # see libmuscle/fortran/src/libmuscle/tests/fortran_micro_model_test.f90
-    cpp_build_dir = Path(__file__).parents[1] / 'libmuscle' / 'cpp' / 'build'
-    env = os.environ.copy()
-    lib_paths = [cpp_build_dir / 'msgpack' / 'msgpack' / 'lib']
-    if 'LD_LIBRARY_PATH' in env:
-        env['LD_LIBRARY_PATH'] += ':' + ':'.join(map(str, lib_paths))
-    else:
-        env['LD_LIBRARY_PATH'] = ':'.join(map(str, lib_paths))
-
-    env['MUSCLE_MANAGER'] = mmp_server_process_simple
-
-    fortran_test_dir = (
-            Path(__file__).parents[1] / 'libmuscle' / 'fortran' / 'build' /
-            'libmuscle' / 'tests')
-    fortran_test_micro = fortran_test_dir / 'fortran_micro_model_test'
-
-    with (tmp_path / 'fortran_stdout.txt').open('w') as f_out:
-        with (tmp_path / 'fortran_stderr.txt').open('w') as f_err:
-            micro_result = subprocess.Popen(
-                    [
-                        str(fortran_test_micro), '--muscle-instance=micro',
-                        f'--muscle-manager={mmp_server_process_simple}'
-                        ], env=env, stdout=f_out, stderr=f_err)
-
-    # run macro model
-    macro_process = mp.Process(
-            target=run_macro,
-            args=('macro', mmp_server_process_simple))
-    macro_process.start()
-
-    # check results
-    micro_result.wait()
-    assert micro_result.returncode == 0
-    macro_process.join()
-    assert macro_process.exitcode == 0
+    run_manager_with_actors(
+            mmp_server_config_simple,
+            tmp_path,
+            {},
+            {'micro': Path('libmuscle') / 'tests' / 'fortran_micro_model_test'},
+            {'macro': macro})
diff --git a/integration_test/test_multicast_cpp.py b/integration_test/test_multicast_cpp.py
index d97fc0d6..7daa62d3 100644
--- a/integration_test/test_multicast_cpp.py
+++ b/integration_test/test_multicast_cpp.py
@@ -1,18 +1,10 @@
 from pathlib import Path
-import sys
 
 import ymmsl
 
 from libmuscle import Instance
-from libmuscle.manager.manager import Manager
-from libmuscle.manager.run_dir import RunDir
 
-# when executing this file as a component, .conftest cannot be resolved
-if __name__ == "__main__":
-    def skip_if_python_only(func):
-        return func
-else:
-    from .conftest import skip_if_python_only
+from .conftest import skip_if_python_only, run_manager_with_actors
 
 
 def receiver():
@@ -28,19 +20,9 @@ def receiver():
 
 
 @skip_if_python_only
-def test_multicast_cpp(tmpdir):
-    tmppath = Path(str(tmpdir))
-
-    # find our test component and its requirements
-    cpp_build_dir = Path(__file__).parents[1] / 'libmuscle' / 'cpp' / 'build'
-    lib_paths = [cpp_build_dir / 'msgpack' / 'msgpack' / 'lib']
-    ld_lib_path = ':'.join(map(str, lib_paths))
-
-    cpp_test_dir = cpp_build_dir / 'libmuscle' / 'tests'
-    test_component = cpp_test_dir / 'component_test'
-
-    # make config
-    ymmsl_text = f"""
+def test_multicast_cpp(tmp_path):
+    run_manager_with_actors(
+        """
 ymmsl_version: v0.1
 model:
   name: test_model
@@ -54,38 +36,8 @@ def test_multicast_cpp(tmpdir):
   conduits:
     multicast.out:
     - receiver1.in
-    - receiver2.in
-implementations:
-  component:
-    env:
-      LD_LIBRARY_PATH: {ld_lib_path}
-    executable: {test_component}
-  receiver:
-    executable: {sys.executable}
-    args:
-    - {__file__}
-resources:
-  multicast:
-    threads: 1
-  receiver1:
-    threads: 1
-  receiver2:
-    threads: 1"""
-
-    config = ymmsl.load(ymmsl_text)
-    config.as_configuration().check_consistent()
-
-    # set up
-    run_dir = RunDir(tmppath / 'run')
-
-    # launch MUSCLE Manager with simulation
-    manager = Manager(config, run_dir)
-    manager.start_instances()
-    success = manager.wait()
-
-    # check that all did not go well
-    assert success
-
-
-if __name__ == "__main__":
-    receiver()
+    - receiver2.in""",
+        tmp_path,
+        {'multicast': Path('libmuscle') / 'tests' / 'component_test'},
+        {},
+        {'receiver1': receiver, 'receiver2': receiver})

From 634f9fdfe2d1947078e6704969499aa6b8f1b559 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Thu, 17 Nov 2022 10:07:41 +0100
Subject: [PATCH 078/183] Update checkpoint trigger logic

---
 .../python/libmuscle/checkpoint_triggers.py   | 60 ++++++++++++-------
 libmuscle/python/libmuscle/instance.py        | 15 +++++
 .../python/libmuscle/snapshot_manager.py      | 27 +++++----
 3 files changed, 68 insertions(+), 34 deletions(-)

diff --git a/libmuscle/python/libmuscle/checkpoint_triggers.py b/libmuscle/python/libmuscle/checkpoint_triggers.py
index ff12918b..dbb4f321 100644
--- a/libmuscle/python/libmuscle/checkpoint_triggers.py
+++ b/libmuscle/python/libmuscle/checkpoint_triggers.py
@@ -203,6 +203,8 @@ def set_checkpoint_info(
         self._has_checkpoints = True
         self._monotonic_reference = _utc_to_monotonic(utc_reference)
 
+        self._checkpoint_at_end = checkpoints.at_end
+
         self._wall = CombinedCheckpointTriggers(checkpoints.wallclock_time)
         self._prevwall = 0.0
         self._nextwall = self._wall.next_checkpoint(0.0)  # type: Optional[float]
@@ -213,7 +215,6 @@ def set_checkpoint_info(
         self._sim_reset = True
 
         self._first_reuse = True
-        self._max_f_init_next_timestamp = None  # type: Optional[float]
 
         # These attributes are only used to check if implementations are
         # following the guidelines
@@ -226,44 +227,47 @@ def elapsed_walltime(self) -> float:
         """
         return time.monotonic() - self._monotonic_reference
 
+    def snapshots_enabled(self) -> bool:
+        """Check if the current workflow has snapshots enabled.
+        """
+        return self._has_checkpoints
+
     def should_save_snapshot(self, timestamp: float) -> bool:
         """Handles instance.should_save_snapshot
         """
         if not self._has_checkpoints:
             return False
 
-        if self._should_have_saved:
-            _checkpoint_error('"should_save_snapshot" or '
-                              '"should_save_final_snapshot" returned positive'
-                              ' but no snapshot was saved before the next call')
+        self.__check_should_have_saved()
 
         elapsed_walltime = self.elapsed_walltime()
         value = self.__should_save(elapsed_walltime, timestamp)
         self._should_have_saved = value
         return value
 
-    def should_save_final_snapshot(self) -> bool:
+    def should_save_final_snapshot(
+            self, do_reuse: bool, f_init_max_timestamp: Optional[float]
+            ) -> bool:
         """Handles instance.should_save_final_snapshot
         """
         if not self._has_checkpoints:
             return False
 
-        if self._should_have_saved:
-            _checkpoint_error('"should_save_snapshot" or '
-                              '"should_save_final_snapshot" returned positive'
-                              ' but no snapshot was saved before the next call')
+        self.__check_should_have_saved()
 
         value = False
-        if self._max_f_init_next_timestamp is None:
-            # If the messages on F_INIT do not supply a next_timestamp, we will
-            # always snapshot just before O_I
+        if not do_reuse and self._checkpoint_at_end:
             value = True
-            self._last_triggers = ['No "next_timestamp" provided on F_INIT'
-                                   ' messages']
+            self._last_triggers.append('at_end')
+        elif f_init_max_timestamp is None:
+            # No F_INIT messages received: reuse triggered on muscle_settings_in
+            # message.
+            _logger.debug('Reuse triggered by muscle_settings_in.'
+                          ' Not creating a snapshot.')
+            self._sim_reset = True
         else:
             elapsed_walltime = self.elapsed_walltime()
-            value = self.__should_save(elapsed_walltime,
-                                       self._max_f_init_next_timestamp)
+            value = self.__should_save(elapsed_walltime, f_init_max_timestamp)
 
         self._should_have_saved = value
         self._should_save_final_called = True
@@ -303,16 +307,15 @@ def update_checkpoints(self, timestamp: float, final: bool) -> None:
                          ' Instance.snapshots_enabled() to test if it is useful'
                          ' to save a snapshot.')
             return
+        if final and self._saved_final_checkpoint:
+            raise RuntimeError(
+                    'You may only save a final snapshot once per reuse loop.')
 
         self._prevwall = self.elapsed_walltime()
         self._nextwall = self._wall.next_checkpoint(self._prevwall)
 
-        if final and self._max_f_init_next_timestamp is not None:
-            simulation_time = self._max_f_init_next_timestamp
-        else:
-            simulation_time = timestamp
-        self._prevsim = simulation_time
-        self._nextsim = self._sim.next_checkpoint(simulation_time)
+        self._prevsim = timestamp
+        self._nextsim = self._sim.next_checkpoint(timestamp)
 
         # this method is also called during resume, after which we no longer
         # consider the simulation_time as reset
@@ -327,6 +330,17 @@ def get_triggers(self) -> List[str]:
         self._last_triggers = []
         return triggers
 
+    def __check_should_have_saved(self) -> None:
+        """Check if a snapshot is saved when required."""
+        if self._should_have_saved:
+            _checkpoint_error('"should_save_snapshot" or '
+                              '"should_save_final_snapshot" returned positive'
+                              ' but no snapshot was saved before the next call'
+                              ' to a should_save_ method.'
+                              ' You must call the corresponding save_snapshot'
+                              ' or save_final_snapshot method when should_save_'
+                              ' returns True.')
+
     def __should_save(self, walltime: float, simulation_time: float) -> bool:
         """Check if a checkpoint should be taken
 
diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py
index c064e6af..44525347 100644
--- a/libmuscle/python/libmuscle/instance.py
+++ b/libmuscle/python/libmuscle/instance.py
@@ -390,6 +390,17 @@ def receive_with_settings(
         """
         return self.__receive_message(port_name, slot, default, True)
 
+    def snapshots_enabled(self) -> bool:
+        """Check if the current workflow has snapshots enabled.
+
+        When snapshots are not enabled, all calls to should_save_snapshot and
+        should_save_final_snapshot will return False.
+
+        Returns:
+            True iff checkpoint rules are defined in the workflow yMMSL.
+        """
+        return self._snapshot_manager.snapshots_enabled()
+
     def resuming(self) -> bool:
         """Check if this instance is resuming from a snapshot.
 
@@ -517,6 +528,10 @@ def should_save_final_snapshot(self, *, apply_overlay: bool = True) -> bool:
             True iff a final snapshot should be taken by the submodel according
             to the checkpoint rules provided in the ymmsl configuration.
         """
+        if self._do_reuse is not None:
+            raise RuntimeError(
+                    'You may not call should_save_final_snapshot more than once'
+                    ' per reuse loop.')
         self._do_reuse = self.__check_reuse_instance(apply_overlay)
         f_init_max_timestamp = max(
                 (msg.timestamp for msg in self._f_init_cache.values()),
diff --git a/libmuscle/python/libmuscle/snapshot_manager.py b/libmuscle/python/libmuscle/snapshot_manager.py
index ffd7e4b2..00c1a4ca 100644
--- a/libmuscle/python/libmuscle/snapshot_manager.py
+++ b/libmuscle/python/libmuscle/snapshot_manager.py
@@ -41,7 +41,7 @@ def __init__(self,
         self._manager = manager
 
         self._first_reuse = True
-        self._trigger = TriggerManager()
+        self._trigger_manager = TriggerManager()
         self._resume_from_snapshot = None   # type: Optional[Snapshot]
         self._snapshot_directory = None     # type: Optional[Path]
         self._next_snapshot_num = 1
@@ -63,13 +63,13 @@ def _set_checkpoint_info(self,
             checkpoints: requested workflow checkpoints
             resume: previous snapshot to resume from (or None if not resuming)
         """
-        self._trigger.set_checkpoint_info(utc_reference, checkpoints)
+        self._trigger_manager.set_checkpoint_info(utc_reference, checkpoints)
         if resume is not None:
             self.__load_snapshot(resume)
             snapshot = cast(Snapshot, self._resume_from_snapshot)
             self._communicator.restore_message_counts(
                 snapshot.port_message_counts)
-            self._trigger.update_checkpoints(
+            self._trigger_manager.update_checkpoints(
                 snapshot.message.timestamp,
                 snapshot.is_final_snapshot)
 
@@ -79,7 +79,7 @@ def reuse_instance(self, snapshot_directory: Optional[Path]) -> None:
         Args:
             snapshot_directory: Path to store this instance's snapshots in.
         """
-        self._trigger.reuse_instance()
+        self._trigger_manager.reuse_instance()
 
         self._snapshot_directory = snapshot_directory
 
@@ -88,6 +88,11 @@ def reuse_instance(self, snapshot_directory: Optional[Path]) -> None:
         else:
             self._resume_from_snapshot = None
 
+    def snapshots_enabled(self) -> bool:
+        """Check if the current workflow has snapshots enabled.
+        """
+        return self._trigger_manager.snapshots_enabled()
+
     def resuming(self) -> bool:
         """Check if we are resuming during this reuse iteration.
         """
@@ -115,15 +120,15 @@ def load_snapshot(self) -> Message:
     def should_save_snapshot(self, timestamp: float) -> bool:
         """See :meth:`TriggerManager.should_save_snapshot`
         """
-        return self._trigger.should_save_snapshot(timestamp)
+        return self._trigger_manager.should_save_snapshot(timestamp)
 
     def should_save_final_snapshot(
-            self, do_reuse: bool,
-            f_init_max_timestamp: Optional[float]
+            self, do_reuse: bool, f_init_max_timestamp: Optional[float]
             ) -> bool:
         """See :meth:`TriggerManager.should_save_final_snapshot`
         """
-        return self._trigger.should_save_final_snapshot()
+        return self._trigger_manager.should_save_final_snapshot(
+                do_reuse, f_init_max_timestamp)
 
     def save_snapshot(self, msg: Message) -> None:
         """Save snapshot contained in the message object.
@@ -142,8 +147,8 @@ def __save_snapshot(self, msg: Message, final: bool) -> None:
             msg: message object representing the snapshot
             final: True iff called from save_final_snapshot
         """
-        triggers = self._trigger.get_triggers()
-        wallclock_time = self._trigger.elapsed_walltime()
+        triggers = self._trigger_manager.get_triggers()
+        wallclock_time = self._trigger_manager.elapsed_walltime()
 
         port_message_counts = self._communicator.get_message_counts()
         snapshot = MsgPackSnapshot(
@@ -153,7 +158,7 @@ def __save_snapshot(self, msg: Message, final: bool) -> None:
         metadata = SnapshotMetadata.from_snapshot(snapshot, str(path))
         self._manager.submit_snapshot_metadata(self._instance_id, metadata)
 
-        self._trigger.update_checkpoints(msg.timestamp, final)
+        self._trigger_manager.update_checkpoints(msg.timestamp, final)
 
     def __load_snapshot(self, snapshot_location: Path) -> None:
         """Load a previously stored snapshot from the filesystem

From a808a9e870e3a1657b7246b60c41daa82fcc7088 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Thu, 17 Nov 2022 10:45:36 +0100
Subject: [PATCH 079/183] Update tox.ini: pass cmdline args to pytest

Example:
`tox -- --lf` to rerun failed pytest tests
---
 tox.ini | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tox.ini b/tox.ini
index 717d5107..fcaa0c30 100644
--- a/tox.ini
+++ b/tox.ini
@@ -18,7 +18,7 @@ passenv =
 
 commands =
     mypy
-    pytest
+    pytest {posargs}
     flake8 libmuscle/python/libmuscle integration_test
 
 [gh-actions]

From 10d8fb82df0525b2c35b3cc45002a7e4c9510c0b Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Thu, 17 Nov 2022 10:51:10 +0100
Subject: [PATCH 080/183] Update checkpoint trigger tests

---
 .../python/libmuscle/checkpoint_triggers.py   |  7 ++-
 libmuscle/python/libmuscle/instance.py        |  6 +-
 .../python/libmuscle/snapshot_manager.py      | 17 ++++--
 .../test/test_checkpoint_triggers.py          | 60 +++++++++----------
 4 files changed, 49 insertions(+), 41 deletions(-)

diff --git a/libmuscle/python/libmuscle/checkpoint_triggers.py b/libmuscle/python/libmuscle/checkpoint_triggers.py
index dbb4f321..61b6cdca 100644
--- a/libmuscle/python/libmuscle/checkpoint_triggers.py
+++ b/libmuscle/python/libmuscle/checkpoint_triggers.py
@@ -294,11 +294,12 @@ def reuse_instance(self) -> None:
             self._saved_final_checkpoint = False
 
     def update_checkpoints(self, timestamp: float, final: bool) -> None:
-        """Update last and next checkpoint times when a snapshot is made
+        """Update last and next checkpoint times when a snapshot is made.
 
         Args:
-            timestamp: timestamp as reported by the instance
-            next_timestamp: next timestamp as reported by the instance
+            timestamp: timestamp as reported by the instance (or from incoming
+                F_INIT messages when final=True).
+            final: True iff this is coming from a save_final_snapshot call.
         """
         if not self._has_checkpoints:
             _logger.info('Saving a snapshot, but no snapshots requested by the'
diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py
index 44525347..efbb9b92 100644
--- a/libmuscle/python/libmuscle/instance.py
+++ b/libmuscle/python/libmuscle/instance.py
@@ -560,7 +560,11 @@ def save_final_snapshot(self, message: Message) -> None:
                 attribute can be used to store the internal state of the
                 submodel.
         """
-        return self._snapshot_manager.save_final_snapshot(message)
+        f_init_max_timestamp = max(
+                (msg.timestamp for msg in self._f_init_cache.values()),
+                default=None)
+        return self._snapshot_manager.save_final_snapshot(
+                message, f_init_max_timestamp)
 
     def _register(self) -> None:
         """Register this instance with the manager.
diff --git a/libmuscle/python/libmuscle/snapshot_manager.py b/libmuscle/python/libmuscle/snapshot_manager.py
index 00c1a4ca..5f67bfed 100644
--- a/libmuscle/python/libmuscle/snapshot_manager.py
+++ b/libmuscle/python/libmuscle/snapshot_manager.py
@@ -135,12 +135,16 @@ def save_snapshot(self, msg: Message) -> None:
         """
         self.__save_snapshot(msg, False)
 
-    def save_final_snapshot(self, msg: Message) -> None:
+    def save_final_snapshot(
+            self, msg: Message, f_init_max_timestamp: Optional[float]) -> None:
         """Save final snapshot contained in the message object
         """
-        self.__save_snapshot(msg, True)
+        self.__save_snapshot(msg, True, f_init_max_timestamp)
 
-    def __save_snapshot(self, msg: Message, final: bool) -> None:
+    def __save_snapshot(
+            self, msg: Message, final: bool,
+            f_init_max_timestamp: Optional[float] = None
+            ) -> None:
         """Actual implementation used by save_(final_)snapshot.
 
         Args:
@@ -158,7 +162,12 @@ def __save_snapshot(self, msg: Message, final: bool) -> None:
         metadata = SnapshotMetadata.from_snapshot(snapshot, str(path))
         self._manager.submit_snapshot_metadata(self._instance_id, metadata)
 
-        self._trigger_manager.update_checkpoints(msg.timestamp, final)
+        timestamp = msg.timestamp
+        if final and f_init_max_timestamp is not None:
+            # For final snapshots f_init_max_snapshot is the reference time (see
+            # should_save_Final_snapshot).
+            timestamp = f_init_max_timestamp
+        self._trigger_manager.update_checkpoints(timestamp, final)
 
     def __load_snapshot(self, snapshot_location: Path) -> None:
         """Load a previously stored snapshot from the filesystem
diff --git a/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py b/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py
index 4f6eed0d..0cbf47b2 100644
--- a/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py
+++ b/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py
@@ -148,58 +148,54 @@ def test_trigger_manager_reference_time():
     assert 15.0 < elapsed_walltime <= (15.0 + elapsed_monotonic)
 
 
-@pytest.mark.skip("To be updated")
 def test_trigger_manager():
     reference = datetime.now(timezone.utc)
     trigger_manager = TriggerManager()
     trigger_manager.set_checkpoint_info(reference, Checkpoints(
+            at_end=True,
             wallclock_time=[CheckpointAtRule([1e-12])],
             simulation_time=[CheckpointAtRule([1, 3, 5])]))
 
-    trigger_manager.reuse_instance(7)
+    trigger_manager.reuse_instance()
 
-    t, t_next = 0.1, 0.2
-    assert trigger_manager.should_save_snapshot(t, t_next)
+    assert trigger_manager.should_save_snapshot(0.1)
     triggers = trigger_manager.get_triggers()
     assert len(triggers) == 1
     assert "wallclock_time" in triggers[0]
     with pytest.raises(RuntimeError):  # did not call save in between
-        trigger_manager.should_save_snapshot(t, t_next)
-    trigger_manager.update_checkpoints(t, t_next, False)
+        trigger_manager.should_save_snapshot(0.1)
+    trigger_manager.update_checkpoints(0.1, False)
 
-    t, t_next = 0.2, 0.9
-    assert not trigger_manager.should_save_snapshot(t, t_next)
+    assert not trigger_manager.should_save_snapshot(0.99)
 
-    t, t_next = 0.9, 3.1
-    assert trigger_manager.should_save_snapshot(t, t_next)
-    assert len(trigger_manager.get_triggers()) == 1
-    trigger_manager.update_checkpoints(t, t_next, False)
+    assert trigger_manager.should_save_snapshot(3.2)
+    triggers = trigger_manager.get_triggers()
+    assert len(triggers) == 1
+    assert "simulation_time" in triggers[0]
+    trigger_manager.update_checkpoints(3.2, False)
 
-    t, t_next = 3.1, None
-    assert trigger_manager.should_save_final_snapshot(t)
+    assert trigger_manager.should_save_final_snapshot(True, 7.0)
     with pytest.raises(RuntimeError):  # did not call save in between
-        trigger_manager.should_save_snapshot(t, 4.0)
+        trigger_manager.should_save_snapshot(4.0)
     with pytest.raises(RuntimeError):  # did not call save in between
-        trigger_manager.should_save_final_snapshot(t)
+        trigger_manager.should_save_final_snapshot(True, 7.0)
     assert len(trigger_manager.get_triggers()) > 0
-    trigger_manager.update_checkpoints(t, t_next, True)
+    trigger_manager.update_checkpoints(7.0, True)
 
-    trigger_manager.reuse_instance(None)
+    trigger_manager.reuse_instance()
 
-    t, t_next = 7.1, 8.2
-    assert not trigger_manager.should_save_snapshot(t, t_next)
+    assert not trigger_manager.should_save_snapshot(7.1)
     with pytest.raises(RuntimeError):  # no should_save_final called
-        trigger_manager.reuse_instance(None)
-    t, t_next = 8.2, None
-    assert trigger_manager.should_save_final_snapshot(t)
+        trigger_manager.reuse_instance()
+
+    assert trigger_manager.should_save_final_snapshot(False, None)
     with pytest.raises(RuntimeError):  # not saved
-        trigger_manager.reuse_instance(None)
-    trigger_manager.update_checkpoints(t, t_next, True)
+        trigger_manager.reuse_instance()
+    trigger_manager.update_checkpoints(7.1, True)
 
-    trigger_manager.reuse_instance(None)
+    trigger_manager.reuse_instance()
 
 
-@pytest.mark.skip("To be updated")
 def test_trigger_manager_warnings(caplog: pytest.LogCaptureFixture,
                                   monkeypatch: pytest.MonkeyPatch):
     monkeypatch.setenv("MUSCLE_DISABLE_CHECKPOINT_ERRORS", "1")
@@ -209,15 +205,13 @@ def test_trigger_manager_warnings(caplog: pytest.LogCaptureFixture,
     trigger_manager.set_checkpoint_info(reference, Checkpoints(
             simulation_time=[CheckpointAtRule([1, 3, 5])]))
 
-    trigger_manager.reuse_instance(2)
+    trigger_manager.reuse_instance()
 
     with caplog.at_level(logging.WARN):
         n_records = len(caplog.records)
-        assert trigger_manager.should_save_snapshot(1.5, None)
-        assert len(caplog.records) == n_records + 1
-        assert "next_timestamp" in caplog.records[-1].message
+        assert trigger_manager.should_save_snapshot(1.5)
+        assert len(caplog.records) == n_records
 
-        n_records = len(caplog.records)
-        trigger_manager.reuse_instance(None)  # suppressed error
+        trigger_manager.reuse_instance()  # suppressed error
         assert len(caplog.records) > n_records
         assert "Suppressed checkpoint error" in caplog.records[-1].message

From 8b8f4fd9f95dacada20cc101c7142b46bf7e1588 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Thu, 17 Nov 2022 10:59:57 +0100
Subject: [PATCH 081/183] Update snapshot manager tests

---
 .../python/libmuscle/checkpoint_triggers.py   |  2 +-
 .../libmuscle/test/test_snapshot_manager.py   | 48 +++++++++----------
 2 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/libmuscle/python/libmuscle/checkpoint_triggers.py b/libmuscle/python/libmuscle/checkpoint_triggers.py
index 61b6cdca..57e26ca7 100644
--- a/libmuscle/python/libmuscle/checkpoint_triggers.py
+++ b/libmuscle/python/libmuscle/checkpoint_triggers.py
@@ -302,7 +302,7 @@ def update_checkpoints(self, timestamp: float, final: bool) -> None:
             final: True iff this is coming from a save_final_snapshot call.
         """
         if not self._has_checkpoints:
-            _logger.info('Saving a snapshot, but no snapshots requested by the'
+            _logger.info('Saving a snapshot but no checkpoints requested by the'
                          ' workflow. Hint: use Instance.should_save_snapshot(),'
                          ' Instance.should_save_final_snapshot() or'
                          ' Instance.snapshots_enabled() to test if it is useful'
diff --git a/libmuscle/python/libmuscle/test/test_snapshot_manager.py b/libmuscle/python/libmuscle/test/test_snapshot_manager.py
index 462c4cd9..972e409b 100644
--- a/libmuscle/python/libmuscle/test/test_snapshot_manager.py
+++ b/libmuscle/python/libmuscle/test/test_snapshot_manager.py
@@ -11,7 +11,6 @@
 from libmuscle.snapshot_manager import SnapshotManager
 
 
-@pytest.mark.skip("To be updated")
 def test_no_checkpointing(caplog: pytest.LogCaptureFixture, tmp_path: Path
                           ) -> None:
     manager = MagicMock()
@@ -22,19 +21,19 @@ def test_no_checkpointing(caplog: pytest.LogCaptureFixture, tmp_path: Path
     snapshot_manager._set_checkpoint_info(
             datetime.now(timezone.utc), Checkpoints(), None)
 
-    snapshot_manager.reuse_instance(None, Path(tmp_path))
     assert not snapshot_manager.resuming()
-    assert not snapshot_manager.should_save_snapshot(1, None)
-    assert not snapshot_manager.should_save_snapshot(5000, None)
-    assert not snapshot_manager.should_save_final_snapshot(1000)
+    snapshot_manager.reuse_instance(tmp_path)
+    assert not snapshot_manager.resuming()
+    assert not snapshot_manager.should_save_snapshot(1)
+    assert not snapshot_manager.should_save_snapshot(5000)
+    assert not snapshot_manager.should_save_final_snapshot(False, None)
 
-    with caplog.at_level(logging.INFO, 'libmuscle.snapshot_manager'):
+    with caplog.at_level(logging.INFO, 'libmuscle'):
         snapshot_manager.save_snapshot(Message(1.0, None, None))
         assert caplog.records[0].levelname == "INFO"
         assert "no checkpoints" in caplog.records[0].message
 
 
-@pytest.mark.skip("To be updated")
 def test_save_load_checkpoint(tmp_path: Path) -> None:
     manager = MagicMock()
     communicator = MagicMock()
@@ -48,13 +47,14 @@ def test_save_load_checkpoint(tmp_path: Path) -> None:
     snapshot_manager._set_checkpoint_info(
             datetime.now(timezone.utc), checkpoints, None)
 
-    snapshot_manager.reuse_instance(None, tmp_path)
+    assert not snapshot_manager.resuming()
+    snapshot_manager.reuse_instance(tmp_path)
     with pytest.raises(RuntimeError):
         snapshot_manager.load_snapshot()
 
     assert not snapshot_manager.resuming()
-    assert snapshot_manager.should_save_snapshot(0.2, 0.4)
-    snapshot_manager.save_snapshot(Message(0.2, 0.4, 'test data'))
+    assert snapshot_manager.should_save_snapshot(0.2)
+    snapshot_manager.save_snapshot(Message(0.2, None, 'test data'))
 
     communicator.get_message_counts.assert_called_with()
     manager.submit_snapshot_metadata.assert_called()
@@ -64,30 +64,30 @@ def test_save_load_checkpoint(tmp_path: Path) -> None:
     assert metadata.triggers
     assert metadata.wallclock_time > 0.0
     assert metadata.timestamp == 0.2
-    assert metadata.next_timestamp == 0.4
+    assert metadata.next_timestamp is None
     assert metadata.port_message_counts == port_message_counts
     assert not metadata.is_final_snapshot
-    fpath = Path(metadata.snapshot_filename)
-    assert fpath.parent == tmp_path
-    assert fpath.name == 'test-1_1.pack'
+    snapshot_path = Path(metadata.snapshot_filename)
+    assert snapshot_path.parent == tmp_path
+    assert snapshot_path.name == 'test-1_1.pack'
 
     snapshot_manager2 = SnapshotManager(instance_id, manager, communicator)
 
     snapshot_manager2._set_checkpoint_info(
-            datetime.now(timezone.utc), checkpoints, fpath)
+            datetime.now(timezone.utc), checkpoints, snapshot_path)
     communicator.restore_message_counts.assert_called_with(port_message_counts)
 
     assert snapshot_manager2.resuming()
-    snapshot_manager2.reuse_instance(None, tmp_path)
+    snapshot_manager2.reuse_instance(tmp_path)
     assert snapshot_manager2.resuming()
     msg = snapshot_manager2.load_snapshot()
     assert msg.timestamp == 0.2
-    assert msg.next_timestamp == 0.4
+    assert msg.next_timestamp is None
     assert msg.data == 'test data'
 
-    assert not snapshot_manager2.should_save_snapshot(0.4, 0.6)
-    assert snapshot_manager2.should_save_final_snapshot(0.6)
-    snapshot_manager2.save_final_snapshot(Message(0.6, None, 'test data2'))
+    assert not snapshot_manager2.should_save_snapshot(0.4)
+    assert snapshot_manager2.should_save_final_snapshot(True, 1.2)
+    snapshot_manager2.save_final_snapshot(Message(0.6, None, 'test data2'), 1.2)
 
     instance, metadata = manager.submit_snapshot_metadata.call_args[0]
     assert instance == instance_id
@@ -98,10 +98,10 @@ def test_save_load_checkpoint(tmp_path: Path) -> None:
     assert metadata.next_timestamp is None
     assert metadata.port_message_counts == port_message_counts
     assert metadata.is_final_snapshot
-    fpath = Path(metadata.snapshot_filename)
-    assert fpath.parent == tmp_path
-    assert fpath.name == 'test-1_2.pack'
+    snapshot_path = Path(metadata.snapshot_filename)
+    assert snapshot_path.parent == tmp_path
+    assert snapshot_path.name == 'test-1_2.pack'
 
     assert snapshot_manager2.resuming()
-    snapshot_manager2.reuse_instance(None, tmp_path)
+    snapshot_manager2.reuse_instance(tmp_path)
     assert not snapshot_manager2.resuming()

From d60c5e6680c63818ba4c642376d786d97bfeb796 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Thu, 17 Nov 2022 17:38:52 +0100
Subject: [PATCH 082/183] Update snapshot registry and tests

- Stateless actors should also send metadata, so no longer special-cased
- Workflow snapshot detection algorithm does an exhaustive search
- Use frozenset wherever possible
---
 .../libmuscle/manager/snapshot_registry.py    | 178 +++++++++---------
 .../manager/test/test_snapshot_registry.py    | 134 +++++++------
 2 files changed, 148 insertions(+), 164 deletions(-)

diff --git a/libmuscle/python/libmuscle/manager/snapshot_registry.py b/libmuscle/python/libmuscle/manager/snapshot_registry.py
index ed1618e3..6bdad7fa 100644
--- a/libmuscle/python/libmuscle/manager/snapshot_registry.py
+++ b/libmuscle/python/libmuscle/manager/snapshot_registry.py
@@ -7,11 +7,11 @@
 from pathlib import Path
 from queue import Queue
 from threading import Thread
-from typing import Dict, Optional, Set, List, Tuple, TypeVar
+from typing import Dict, Optional, Set, FrozenSet, List, Tuple, TypeVar
 
 from ymmsl import (
         Reference, Model, Identifier, Implementation, save,
-        PartialConfiguration, ImplementationState as IState)
+        PartialConfiguration)
 
 from libmuscle.manager.topology_store import TopologyStore
 from libmuscle.snapshot import SnapshotMetadata
@@ -91,15 +91,14 @@ class SnapshotNode:
             snapshots always have a higher num.
         instance: Which instance this is a snapshot of.
         snapshot: The snapshot metadata reported by the instance.
-        stateful_peers: The set of peers that the instance is connected to that
-            have state, which we need to check consistency with.
+        peers: The set of peers that the instance is connected to.
         consistent_peers: Keeps track of snapshots per peer that are consistent
             with this one.
     """
     num: int
     instance: Reference
     snapshot: SnapshotMetadata
-    stateful_peers: Set[Reference]
+    peers: FrozenSet[Reference]
     consistent_peers: Dict[Reference, List["SnapshotNode"]] = field(
             default_factory=dict, repr=False)
 
@@ -108,10 +107,9 @@ def __hash__(self) -> int:
 
     @property
     def consistent(self) -> bool:
-        """Returns True iff there is a consistent checkpoint will all stateful
-        peers.
+        """Returns True iff there is a consistent checkpoint with all peers.
         """
-        return self.consistent_peers.keys() == self.stateful_peers
+        return self.consistent_peers.keys() == self.peers
 
     def do_consistency_check(
             self,
@@ -194,12 +192,9 @@ def __init__(
         self._snapshots = {}                # type: _SnapshotDictType
 
         self._instances = set()             # type: Set[Reference]
-        self._stateful_instances = set()    # type: Set[Reference]
         for component in config.model.components:
-            instances = set(component.instances())
-            self._instances.update(instances)
-            if self._is_stateful(component.name):
-                self._stateful_instances.update(instances)
+            self._instances.update(component.instances())
+        # TODO: create snapshot nodes for starting from scratch
 
     def register_snapshot(
             self, instance: Reference, snapshot: SnapshotMetadata) -> None:
@@ -233,7 +228,7 @@ def _add_snapshot(
             instance: The instance that created the snapshot
             snapshot: Metadata describing the snapshot
         """
-        stateful_peers = self._get_stateful_peers(instance)
+        stateful_peers = self._get_peers(instance)
 
         i_snapshots = self._snapshots.setdefault(instance, [])
         # get next number of the snapshot
@@ -257,45 +252,44 @@ def _save_workflow_snapshot(self, snapshotnode: SnapshotNode) -> None:
             snapshotnode: The snapshot node that must be part of the workflow
                 snapshot.
         """
-        selected_snapshots = self._get_workflow_snapshot(snapshotnode)
-        if selected_snapshots is not None:
-            self._write_snapshot_ymmsl(selected_snapshots)
-            self._cleanup_snapshots(selected_snapshots)
+        workflow_snapshots = self._get_workflow_snapshots(snapshotnode)
+        for workflow_snapshot in workflow_snapshots:
+            self._write_snapshot_ymmsl(workflow_snapshot)
+        self._cleanup_snapshots(workflow_snapshots)
 
-    def _get_workflow_snapshot(
-            self, snapshot: SnapshotNode) -> Optional[List[SnapshotNode]]:
-        """Check if a workflow snapshot exists that contains the provided node.
-
-        Note: if the provided snapshot node is part of multiple workflow
-        snapshots, only the most recent is detected and written to disk.
+    def _get_workflow_snapshots(
+            self, snapshot: SnapshotNode) -> List[List[SnapshotNode]]:
+        """Return all workflow snapshots which contain the provided node.
 
         Args:
             snapshotnode: The snapshot node that must be part of the workflow
                 snapshot.
+
+        Returns:
+            List of workflow snapshots. Each workflow snapshot is a list of
+            instance snapshot nodes.
         """
-        # This implements a greedy assignment algorithm.
         if not snapshot.consistent:
-            return None
+            return []
 
         # Instances that don't have a snapshot node chosen yet:
-        instances_to_cover = list(
-                self._stateful_instances - {snapshot.instance})
+        instances_to_cover = list(self._instances - {snapshot.instance})
         # Allowed snapshots per instance. This is updated during the heuristic
         # to further restrict the sets of snapshots as peer snapshots are
         # selected.
         # First restriction is that the snapshots have to be locally consistent.
-        allowed_snapshots = {}  # type: Dict[Reference, Set[SnapshotNode]]
+        allowed_snapshots = {}  # type: Dict[Reference, FrozenSet[SnapshotNode]]
         for instance in instances_to_cover:
-            allowed_snapshots[instance] = set(
+            allowed_snapshots[instance] = frozenset(
                     i_snapshot
                     for i_snapshot in self._snapshots.get(instance, [])
                     if i_snapshot.consistent)
             if not allowed_snapshots[instance]:
                 # there cannot be a workflow snapshot if this instance has no
                 # consistent snapshot nodes
-                return None
+                return []
         instance = snapshot.instance
-        allowed_snapshots[instance] = {snapshot}
+        allowed_snapshots[instance] = frozenset({snapshot})
 
         def num_allowed_snapshots(instance: Reference) -> int:
             """Get number of allowed snapshots at this point for this instance.
@@ -305,18 +299,23 @@ def num_allowed_snapshots(instance: Reference) -> int:
             """
             return len(allowed_snapshots[instance])
 
+        # Do a full, depth-first search for all workflow snapshots
+        # ========================================================
+
+        workflow_snapshots = []
         selected_snapshots = [snapshot]
         # This stack stores history of allowed_snapshots and enables roll back
-        stack = []  # type: List[Dict[Reference, Set[SnapshotNode]]]
+        stack = []  # type: List[Dict[Reference, FrozenSet[SnapshotNode]]]
 
-        # update allowed_snapshots for peers
+        # Update allowed_snapshots for peers of the selected snapshot
         for peer, snapshots in snapshot.consistent_peers.items():
-            allowed_snapshots[peer].intersection_update(snapshots)
-            if not allowed_snapshots[peer]:
-                return None
+            intersection = allowed_snapshots[peer].intersection(snapshots)
+            if not intersection:
+                return []
+            allowed_snapshots[peer] = intersection
 
-        while instances_to_cover:
-            # select most constrained instance
+        while True:
+            # 1. Select most constrained instance
             #
             # Note: we're only interested in the instance with the least allowed
             # snapshots. Better performance may be possible by not doing a full
@@ -331,44 +330,46 @@ def num_allowed_snapshots(instance: Reference) -> int:
             instances_to_cover.sort(key=num_allowed_snapshots, reverse=True)
             instance = instances_to_cover.pop()
 
-            # select latest snapshot of this instance
-            snapshot = max(allowed_snapshots[instance], key=attrgetter("num"))
+            # 2. Select the oldest snapshot of this instance
+            snapshot = min(allowed_snapshots[instance], key=attrgetter('num'))
             selected_snapshots.append(snapshot)
-            # we put a shallow copy on the stack, so we are not allowed to
-            # modify the sets in the dictionary (see below)
+            # A shallow copy is ok: the values are immutable frozensets
             stack.append(allowed_snapshots.copy())
 
-            # update allowed snapshots with the currently selected
-            allowed_snapshots[instance] = {snapshot}
+            # 3. Update allowed snapshots based on the newly selected
+            allowed_snapshots[instance] = frozenset({snapshot})
             for peer, snapshots in snapshot.consistent_peers.items():
-                # not updating in place to preserve set objects in the stack
                 intersection = allowed_snapshots[peer].intersection(snapshots)
                 if not intersection:
                     break  # roll back
                 allowed_snapshots[peer] = intersection
             else:
-                # not rolling back, go into next iteration of the while-loop
-                continue
+                # 4. Selected snapshot is okay to explore further
+                if instances_to_cover:
+                    # 4a. There are still instance to cover, return to the start
+                    #     of the while loop.
+                    continue
+                # 4b. We have found a complete workflow snapshot
+                workflow_snapshots.append(selected_snapshots.copy())
+                # Next: perform a roll-back to continue the search
 
-            # roll back should stop when selected_snapshots only contains the
-            # one we forced to be part of the workflow snapshot
+            # 5. Roll back
+            # stop when selected_snapshots only contains the one we forced to be
+            # part of the workflow snapshot
             while len(selected_snapshots) > 1:
-                # roll back
                 snapshot = selected_snapshots.pop()
                 instance = snapshot.instance
                 instances_to_cover.append(instance)
                 allowed_snapshots = stack.pop()
-                allowed_snapshots[instance].remove(snapshot)
-                if allowed_snapshots[instance]:
-                    # we have a valid next snapshot to try for this instance
+                intersection = allowed_snapshots[instance] - {snapshot}
+                allowed_snapshots[instance] = intersection
+                if intersection:
+                    # We have a valid next snapshot to try for this instance
                     break
-                # no allowed_snapshots, try another roll back
+                # No allowed_snapshots, try another roll back
             else:
-                # we've exhausted roll back possibilities, there is no
-                # consistent checkpoint
-                return None
-
-        return selected_snapshots
+                # Exhausted all roll back possibilities, so we are done now
+                return workflow_snapshots
 
     def _write_snapshot_ymmsl(
             self, selected_snapshots: List[SnapshotNode]) -> None:
@@ -436,20 +437,32 @@ def _generate_description(
                 '\n'.join(component_table))
 
     def _cleanup_snapshots(
-            self, selected_snapshots: List[SnapshotNode]) -> None:
+            self, workflow_snapshots: List[List[SnapshotNode]]) -> None:
         """Remove all snapshots that are older than the selected snapshots.
 
         Args:
             selected_snapshots: All snapshot nodes of a workflow snapshot
         """
-        # remove all snapshots older than the selected ones
+        if not workflow_snapshots:
+            return
+
+        # Find the newest snapshots per instance
+        newest_snapshots = {snapshot.instance: snapshot
+                            for snapshot in workflow_snapshots[0]}
+        for workflow_snapshot in workflow_snapshots[1:]:
+            for snapshot in workflow_snapshot:
+                if newest_snapshots[snapshot.instance].num < snapshot.num:
+                    newest_snapshots[snapshot.instance] = snapshot
+
+        # Remove all snapshots that are older than the newest snapshots
         removed_snapshots = set()  # type: Set[SnapshotNode]
-        for snapshot in selected_snapshots:
+        for snapshot in newest_snapshots.values():
             all_snapshots = self._snapshots[snapshot.instance]
             idx = all_snapshots.index(snapshot)
             self._snapshots[snapshot.instance] = all_snapshots[idx:]
             removed_snapshots.update(all_snapshots[:idx])
-        # remove all references in SnapshotNode.peer_snapshot to the snapshots
+
+        # Remove all references in SnapshotNode.peer_snapshot to the snapshots
         # that are cleaned up
         for snapshot in removed_snapshots:
             for peer_snapshot in chain.from_iterable(
@@ -462,23 +475,19 @@ def _cleanup_snapshots(
                         snapshot)
 
     @lru_cache(maxsize=None)
-    def _get_stateful_peers(self, instance: Reference) -> Set[Reference]:
-        """Return the set of stateful peers for the given instance.
+    def _get_peers(self, instance: Reference) -> FrozenSet[Reference]:
+        """Return the set of peers for the given instance.
 
-        Note: instance is assumed to contain the full index, not just the kernel
-        name.
+        Note: instance is assumed to contain the full index, not just the
+        component name.
 
         Args:
-            instance: Instance to get stateful peers of. See
-                :meth:`_is_stateful`.
+            instance: Instance to get peers of.
 
         Returns:
-            Set with all stateful peer instances (including their index).
+            Frozen set with all peer instances (including their index).
         """
-        return set(
-                peer
-                for peer in self._topology_store.get_peer_instances(instance)
-                if self._is_stateful(peer.without_trailing_ints()))
+        return frozenset(self._topology_store.get_peer_instances(instance))
 
     @lru_cache(maxsize=None)
     def _get_connections(self, instance: Reference, peer: Reference
@@ -555,22 +564,3 @@ def _implementation(self, kernel: Reference) -> Optional[Implementation]:
         if implementation in self._configuration.implementations:
             return self._configuration.implementations[implementation]
         return None
-
-    @lru_cache(maxsize=None)
-    def _is_stateful(self, kernel: Reference) -> bool:
-        """Check if a kernel has a stateful implementation.
-
-        A kernel is considered stateful if:
-        - There is no Implementation given for the kernel
-        - Implementation.stateful = ImplementationState.STATEFUL
-        - Implementation.stateful = ImplementationState.WEAKLY_STATEFUL and the
-            implementation supports checkpointing. In this case we assume to get
-            snapshots from these kernels and we take them into account in the
-            snapshot graph.
-        """
-        implementation = self._implementation(kernel)
-        if implementation is None:
-            return True  # assume stateful
-        return (implementation.stateful is IState.STATEFUL or
-                implementation.stateful is IState.WEAKLY_STATEFUL and
-                implementation.supports_checkpoint)
diff --git a/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py b/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py
index 71e3fb7c..dd6c0c46 100644
--- a/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py
+++ b/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py
@@ -158,25 +158,23 @@ def test_snapshot_config():
     print(config.description)
 
 
-def test_stateful_peers(uq: Configuration, micro_is_stateless: bool) -> None:
+def test_peers(uq: Configuration) -> None:
     snapshot_registry = SnapshotRegistry(uq, None, TopologyStore(uq))
     macro = Reference('macro')
     micro = Reference('micro')
     qmc = Reference('qmc')
     rr = Reference('rr')
 
-    expected_stateful = {qmc, rr} | {macro + i for i in range(5)}
-    if not micro_is_stateless:
-        expected_stateful.update(micro + i for i in range(5))
-    assert snapshot_registry._stateful_instances == expected_stateful
+    all_instances = {qmc, rr} | {macro + i for i in range(5)}
+    all_instances.update(micro + i for i in range(5))
+    assert snapshot_registry._instances == all_instances
 
-    assert snapshot_registry._get_stateful_peers(qmc) == {rr}
+    assert snapshot_registry._get_peers(qmc) == {rr}
     expected_rr_peers = {qmc} | {macro + i for i in range(5)}
-    assert snapshot_registry._get_stateful_peers(rr) == expected_rr_peers
+    assert snapshot_registry._get_peers(rr) == expected_rr_peers
     for i in range(5):
-        expected_peers = {rr} if micro_is_stateless else {rr, micro + i}
-        assert snapshot_registry._get_stateful_peers(macro + i) == expected_peers
-        assert snapshot_registry._get_stateful_peers(micro + i) == {macro + i}
+        assert snapshot_registry._get_peers(macro + i) == {rr, micro + i}
+        assert snapshot_registry._get_peers(micro + i) == {macro + i}
 
 
 def test_connections(uq: Configuration) -> None:
@@ -238,19 +236,7 @@ def test_implementation(uq: Configuration) -> None:
     assert missing_impl is None
 
 
-def test_stateful(uq: Configuration, micro_is_stateless: bool) -> None:
-    uq.implementations['macro_impl'].stateful = IState.WEAKLY_STATEFUL
-    snapshot_registry = SnapshotRegistry(uq, None, TopologyStore(uq))
-
-    assert snapshot_registry._is_stateful(Reference('macro'))
-    stateful = snapshot_registry._is_stateful(Reference('micro'))
-    assert stateful is not micro_is_stateless
-
-    assert snapshot_registry._is_stateful(Reference('unknown'))
-
-
-def test_macro_micro_snapshots(
-        macro_micro: Configuration, micro_is_stateless: bool) -> None:
+def test_macro_micro_snapshots(macro_micro: Configuration) -> None:
     snapshot_registry = SnapshotRegistry(
             macro_micro, None, TopologyStore(macro_micro))
     # prevent actually writing a ymmsl file, testing that separately
@@ -263,58 +249,71 @@ def test_macro_micro_snapshots(
 
     assert len(snapshot_registry._snapshots[macro]) == 1
     node = snapshot_registry._snapshots[macro][0]
-    assert node.consistent is micro_is_stateless
+    assert node.consistent is False
     assert node.consistent_peers == {}
     assert node.instance == macro
     assert node.num == 1
     assert node.snapshot is macro_snapshot
-    if micro_is_stateless:
-        assert node.stateful_peers == set()
-        snapshot_registry._write_snapshot_ymmsl.assert_called_once_with([node])
-        snapshot_registry._write_snapshot_ymmsl.reset_mock()
-    else:
-        assert node.stateful_peers == {micro}
-        snapshot_registry._write_snapshot_ymmsl.assert_not_called()
+    assert node.peers == {micro}
+    snapshot_registry._write_snapshot_ymmsl.assert_not_called()
 
-    if not micro_is_stateless:
-        # Note: this snapshot is not realistic, it should have come in before
-        # the macro snapshot above. However, it's still useful for testing the
-        # consistency algorithm
-        micro_snapshot = make_snapshot(f_i=[2], o_f=[1])
-        snapshot_registry._add_snapshot(micro, micro_snapshot)
+    # Note: this snapshot is not realistic, it should have come in before
+    # the macro snapshot above. However, it's still useful for testing the
+    # consistency algorithm
+    micro_snapshot = make_snapshot(f_i=[2], o_f=[1])
+    snapshot_registry._add_snapshot(micro, micro_snapshot)
 
-        assert len(snapshot_registry._snapshots[micro]) == 1
-        assert not snapshot_registry._snapshots[micro][0].consistent
-        snapshot_registry._write_snapshot_ymmsl.assert_not_called()
+    assert len(snapshot_registry._snapshots[micro]) == 1
+    assert snapshot_registry._snapshots[micro][0].consistent is False
+    snapshot_registry._write_snapshot_ymmsl.assert_not_called()
 
-        micro_snapshot = make_snapshot(f_i=[3], o_f=[2])
-        snapshot_registry._add_snapshot(micro, micro_snapshot)
+    micro_snapshot = make_snapshot(f_i=[3], o_f=[2])
+    snapshot_registry._add_snapshot(micro, micro_snapshot)
 
-        # micro snapshots should be cleaned up now!
-        assert len(snapshot_registry._snapshots[micro]) == 1
-        micro_node = snapshot_registry._snapshots[micro][0]
-        assert micro_node.consistent
-        snapshot_registry._write_snapshot_ymmsl.assert_called_with(
-                [micro_node, node])
-        snapshot_registry._write_snapshot_ymmsl.reset_mock()
+    # The first micro snapshots should be cleaned up now
+    assert len(snapshot_registry._snapshots[micro]) == 1
+    micro_node = snapshot_registry._snapshots[micro][0]
+    assert micro_node.consistent
+    snapshot_registry._write_snapshot_ymmsl.assert_called_once_with(
+            [micro_node, node])
+    snapshot_registry._write_snapshot_ymmsl.reset_mock()
 
+    # 3 micro snapshots in the same reuse:
+    for _ in range(3):
         micro_snapshot = make_snapshot(f_i=[4], o_f=[3])
         snapshot_registry._add_snapshot(micro, micro_snapshot)
 
-        # micro snapshots should be cleaned up now!
-        assert len(snapshot_registry._snapshots[micro]) == 1
-        micro_node = snapshot_registry._snapshots[micro][0]
-        assert micro_node.consistent
-        snapshot_registry._write_snapshot_ymmsl.assert_called_with(
-                [micro_node, node])
-        snapshot_registry._write_snapshot_ymmsl.reset_mock()
+    # Previous micro snapshot should be cleaned up now
+    assert len(snapshot_registry._snapshots[micro]) == 1
+    micro_node = snapshot_registry._snapshots[micro][-1]
+    assert snapshot_registry._write_snapshot_ymmsl.call_count == 3
+    snapshot_registry._write_snapshot_ymmsl.assert_called_with(
+            [micro_node, node])
+    snapshot_registry._write_snapshot_ymmsl.reset_mock()
 
     macro_snapshot = make_snapshot(o_i=[4], s=[4])
     snapshot_registry._add_snapshot(macro, macro_snapshot)
     snapshot_registry._write_snapshot_ymmsl.assert_called_once()
+    snapshot_registry._write_snapshot_ymmsl.reset_mock()
 
+    # 3 micro snapshots in the same reuse, but inconcistent with previous macro
+    for _ in range(3):
+        micro_snapshot = make_snapshot(f_i=[6], o_f=[5])
+        snapshot_registry._add_snapshot(micro, micro_snapshot)
 
-def test_uq(uq: Configuration, micro_is_stateless: bool) -> None:
+    # All three should be present now in addition to the one last used in
+    # workflow snapshot
+    assert len(snapshot_registry._snapshots[micro]) == 4
+    snapshot_registry._write_snapshot_ymmsl.assert_not_called()
+
+    macro_snapshot = make_snapshot(o_i=[6], s=[6])
+    snapshot_registry._add_snapshot(macro, macro_snapshot)
+    assert snapshot_registry._write_snapshot_ymmsl.call_count == 3
+    assert len(snapshot_registry._snapshots[micro]) == 1
+    assert len(snapshot_registry._snapshots[macro]) == 1
+
+
+def test_uq(uq: Configuration) -> None:
     snapshot_registry = SnapshotRegistry(uq, None, TopologyStore(uq))
     # prevent actually writing a ymmsl file, testing that separately
     snapshot_registry._write_snapshot_ymmsl = MagicMock()
@@ -342,24 +341,19 @@ def test_uq(uq: Configuration, micro_is_stateless: bool) -> None:
         snapshot_registry._add_snapshot(macro + i, macro_snapshot)
         node = snapshot_registry._snapshots[macro + i][-1]
         assert node.consistent_peers.keys() == {rr}
-        if micro_is_stateless and i == 4:
+        snapshot_registry._write_snapshot_ymmsl.assert_not_called()
+
+    micro_snapshot = make_snapshot(f_i=[1], o_f=[0])
+    for i in range(5):
+        snapshot_registry._add_snapshot(micro + i, micro_snapshot)
+        node = snapshot_registry._snapshots[micro + i][-1]
+        assert node.consistent_peers.keys() == {macro + i}
+        if i == 4:
             snapshot_registry._write_snapshot_ymmsl.assert_called_once()
             snapshot_registry._write_snapshot_ymmsl.reset_mock()
         else:
             snapshot_registry._write_snapshot_ymmsl.assert_not_called()
 
-    if not micro_is_stateless:
-        micro_snapshot = make_snapshot(f_i=[1], o_f=[0])
-        for i in range(5):
-            snapshot_registry._add_snapshot(micro + i, micro_snapshot)
-            node = snapshot_registry._snapshots[micro + i][-1]
-            assert node.consistent_peers.keys() == {macro + i}
-            if i == 4:
-                snapshot_registry._write_snapshot_ymmsl.assert_called_once()
-                snapshot_registry._write_snapshot_ymmsl.reset_mock()
-            else:
-                snapshot_registry._write_snapshot_ymmsl.assert_not_called()
-
     qmc_snapshot = make_snapshot(parameters_out=[1, 1, 1, 1, 1], states_in=[])
     snapshot_registry._add_snapshot(qmc, qmc_snapshot)
     node = snapshot_registry._snapshots[qmc][-1]

From 4a3e9c5991f7bb0b1f210ca7d38236fe1b71f61a Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Thu, 17 Nov 2022 17:51:00 +0100
Subject: [PATCH 083/183] Move deregister from manager to reuse_instance()

---
 libmuscle/python/libmuscle/instance.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py
index efbb9b92..a7e5ec63 100644
--- a/libmuscle/python/libmuscle/instance.py
+++ b/libmuscle/python/libmuscle/instance.py
@@ -146,6 +146,10 @@ def reuse_instance(self, apply_overlay: bool = True) -> bool:
             snapshot_path = None
         self._snapshot_manager.reuse_instance(snapshot_path)
 
+        if not do_reuse:
+            self._deregister()
+            self.__manager.close()
+
         return do_reuse
 
     def error_shutdown(self, message: str) -> None:
@@ -677,8 +681,6 @@ def __check_reuse_instance(self, apply_overlay: bool) -> bool:
         if not do_reuse:
             self.__close_ports()
             self._communicator.shutdown()
-            self._deregister()
-            self.__manager.close()
         return do_reuse
 
     def __receive_message(

From 3da1e3e13353c25203027f1d33be479ce9830428 Mon Sep 17 00:00:00 2001
From: Lourens Veen <l.veen@esciencecenter.nl>
Date: Fri, 18 Nov 2022 19:03:13 +0100
Subject: [PATCH 084/183] Run native compatibility test also with older OS
 versions

---
 .github/workflows/ci_ubuntu18.04_clang.yaml   | 19 +++++++++++++++++++
 .github/workflows/ci_ubuntu20.04.yaml         |  2 +-
 .github/workflows/ci_ubuntu20.04_clang.yaml   | 19 +++++++++++++++++++
 ...4_clang.yaml => ci_ubuntu22.04_clang.yaml} |  3 +--
 4 files changed, 40 insertions(+), 3 deletions(-)
 create mode 100644 .github/workflows/ci_ubuntu18.04_clang.yaml
 create mode 100644 .github/workflows/ci_ubuntu20.04_clang.yaml
 rename .github/workflows/{ci_ubuntu_22.04_clang.yaml => ci_ubuntu22.04_clang.yaml} (95%)

diff --git a/.github/workflows/ci_ubuntu18.04_clang.yaml b/.github/workflows/ci_ubuntu18.04_clang.yaml
new file mode 100644
index 00000000..49864bc2
--- /dev/null
+++ b/.github/workflows/ci_ubuntu18.04_clang.yaml
@@ -0,0 +1,19 @@
+# Run Continuous Integration for the latest Ubuntu release
+# This mainly checks for issues/regressions in the native build
+name: native_compatibility_ubuntu18.04_clang
+on:
+  schedule:
+    - cron: '30 2 * * 0'
+  push:
+    branches:
+      - 'release-*'
+      - fix_native_compatibility_ci
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v2
+
+    - name: Run tests on Ubuntu 18.04 with Clang
+      run: docker run -v "${GITHUB_WORKSPACE}:/workspace" --env LC_ALL=C.UTF-8 --env LANG=C.UTF-8 --env DEBIAN_FRONTEND=noninteractive ubuntu:18.04 /bin/bash -c 'apt-get update && apt-get -y dist-upgrade && apt-get -y install build-essential clang cmake gfortran git valgrind libopenmpi-dev pkg-config python3 python3-pip python3-venv curl && apt-get -y remove libssl-dev zlib1g-dev && useradd -m -d /home/muscle3 muscle3 && echo "Added user" && su muscle3 -c -- "cp -r --preserve=mode /workspace /home/muscle3/muscle3" && echo "Copied files" && su muscle3 -c -- "pip3 install --user -U \"pip<22\" setuptools wheel" && su muscle3 -c -- "pip3 install --user \"ymmsl>=0.12.0,<0.13\" qcg-pilotjob==0.13.1" && su muscle3 -c -- "cd /home/muscle3/muscle3 && CXXFLAGS=-fPIE OMPI_CXX=clang++ CXX=clang++ make test_examples"'
diff --git a/.github/workflows/ci_ubuntu20.04.yaml b/.github/workflows/ci_ubuntu20.04.yaml
index c9d0c595..f3c51d10 100644
--- a/.github/workflows/ci_ubuntu20.04.yaml
+++ b/.github/workflows/ci_ubuntu20.04.yaml
@@ -3,7 +3,7 @@
 name: native_compatibility_ubuntu20.04
 on:
   schedule:
-    - cron: '0 4 * * 0'
+    - cron: '0 3 * * 0'
   push:
     branches:
       - 'release-*'
diff --git a/.github/workflows/ci_ubuntu20.04_clang.yaml b/.github/workflows/ci_ubuntu20.04_clang.yaml
new file mode 100644
index 00000000..749aca38
--- /dev/null
+++ b/.github/workflows/ci_ubuntu20.04_clang.yaml
@@ -0,0 +1,19 @@
+# Run Continuous Integration for the latest Ubuntu release
+# This mainly checks for issues/regressions in the native build
+name: native_compatibility_ubuntu20.04_clang
+on:
+  schedule:
+    - cron: '30 3 * * 0'
+  push:
+    branches:
+      - 'release-*'
+      - fix_native_compatibility_ci
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v2
+
+    - name: Run tests on Ubuntu 20.04 with Clang
+      run: docker run -v "${GITHUB_WORKSPACE}:/workspace" --env LC_ALL=C.UTF-8 --env LANG=C.UTF-8 --env DEBIAN_FRONTEND=noninteractive ubuntu:20.04 /bin/bash -c 'apt-get update && apt-get -y dist-upgrade && apt-get -y install build-essential clang cmake gfortran git valgrind libopenmpi-dev pkg-config python3 python3-pip python3-venv curl && apt-get -y remove libssl-dev zlib1g-dev && useradd -m -d /home/muscle3 muscle3 && su muscle3 -c -- "cp -r --preserve=mode /workspace /home/muscle3/muscle3" && su muscle3 -c -- "pip3 install --user -U \"pip<22\" setuptools wheel" && su muscle3 -c -- "pip3 install --user \"ymmsl>=0.12.0,<0.13\" qcg-pilotjob==0.13.1" && su muscle3 -c -- "cd /home/muscle3/muscle3 && CXXFLAGS=-fPIE OMPI_CXX=clang++ CXX=clang++ make test_examples"'
diff --git a/.github/workflows/ci_ubuntu_22.04_clang.yaml b/.github/workflows/ci_ubuntu22.04_clang.yaml
similarity index 95%
rename from .github/workflows/ci_ubuntu_22.04_clang.yaml
rename to .github/workflows/ci_ubuntu22.04_clang.yaml
index 125b3fe6..3c20e9e8 100644
--- a/.github/workflows/ci_ubuntu_22.04_clang.yaml
+++ b/.github/workflows/ci_ubuntu22.04_clang.yaml
@@ -3,12 +3,11 @@
 name: native_compatibility_ubuntu22.04_clang
 on:
   schedule:
-    - cron: '0 3 * * 0'
+    - cron: '30 4 * * 0'
   push:
     branches:
       - 'release-*'
       - fix_native_compatibility_ci
-      - feature/clang_build
 jobs:
   build:
     runs-on: ubuntu-latest

From 1c0f6820833bb17b540502ca43478bbe7a746c2b Mon Sep 17 00:00:00 2001
From: Lourens Veen <l.veen@esciencecenter.nl>
Date: Sat, 19 Nov 2022 19:38:51 +0100
Subject: [PATCH 085/183] Add building with clang to documentation (thanks
 Maarten)

---
 docs/source/installing.rst.in | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/docs/source/installing.rst.in b/docs/source/installing.rst.in
index 24c0dcf2..cbafedad 100644
--- a/docs/source/installing.rst.in
+++ b/docs/source/installing.rst.in
@@ -86,6 +86,12 @@ helpdesk. ``cmake`` is only needed to build the MessagePack dependency, so if
 that's already available then you don't need ``cmake```. On a cluster, there is
 usually a ``cmake`` module to load.
 
+MUSCLE3 can be built with **clang** as well, if you prefer. You'll need to
+install it using something like ``sudo apt-get install clang``, and modify the
+build command a bit, see below. Note that clang does not have a production-ready
+Fortran compiler yet, but the commands below will help you build the C++ part
+with clang, and the Fortran part with gfortran.
+
 If your submodels use MPI, then you'll need to compile the MPI support for
 MUSCLE3. This requires an MPI library to be available. Libmuscle has been
 tested with OpenMPI on Ubuntu, but should work with other MPI implementations
@@ -203,6 +209,20 @@ As an example, to build libmuscle using 2 cores, you would do:
 This will take a few minutes (including building the dependencies), depending on
 the speed of your machine.
 
+**Building with clang**
+
+To build with clang, use
+
+.. code-block:: bash
+
+  ~/mucle3_source/muscle3-0.5.0$ CXXFLAGS=-fPIE OMPI_CXX=clang++ CXX=clang++ make
+
+
+This will tell the build system to use clang for compiling the C++ code and its
+MPI support, but still use gfortran to compile the Fortran code (if gfortran is
+installed). The extra ``-fPIE`` switch is needed to make that combination work
+on some common platforms.
+
 
 Getting help
 ````````````

From ee5fbc1714e6f4cebd1c8e86512b8275f9d5e491 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Mon, 21 Nov 2022 13:22:27 +0100
Subject: [PATCH 086/183] Fix checkpointing bugs

---
 libmuscle/python/libmuscle/instance.py        | 10 ++---
 .../python/libmuscle/snapshot_manager.py      | 44 ++++++++++++-------
 .../libmuscle/test/test_snapshot_manager.py   |  3 +-
 3 files changed, 35 insertions(+), 22 deletions(-)

diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py
index a7e5ec63..379067c8 100644
--- a/libmuscle/python/libmuscle/instance.py
+++ b/libmuscle/python/libmuscle/instance.py
@@ -147,6 +147,8 @@ def reuse_instance(self, apply_overlay: bool = True) -> bool:
         self._snapshot_manager.reuse_instance(snapshot_path)
 
         if not do_reuse:
+            self.__close_ports()
+            self._communicator.shutdown()
             self._deregister()
             self.__manager.close()
 
@@ -568,7 +570,7 @@ def save_final_snapshot(self, message: Message) -> None:
                 (msg.timestamp for msg in self._f_init_cache.values()),
                 default=None)
         return self._snapshot_manager.save_final_snapshot(
-                message, f_init_max_timestamp)
+                message, f_init_max_timestamp, self._do_reuse)
 
     def _register(self) -> None:
         """Register this instance with the manager.
@@ -657,8 +659,7 @@ def __check_reuse_instance(self, apply_overlay: bool) -> bool:
         # TODO: _f_init_cache should be empty here, or the user didn't
         # receive something that was sent on the last go-around.
         # At least emit a warning.
-        if not (self.resuming() and self._first_run):
-            # when resuming we skip receiving on f_init in the first run
+        if self.should_init() or not self._first_run:
             self.__pre_receive_f_init(apply_overlay)
 
         self._set_local_log_level()
@@ -678,9 +679,6 @@ def __check_reuse_instance(self, apply_overlay: bool) -> bool:
                     do_reuse = False
         self._first_run = False
 
-        if not do_reuse:
-            self.__close_ports()
-            self._communicator.shutdown()
         return do_reuse
 
     def __receive_message(
diff --git a/libmuscle/python/libmuscle/snapshot_manager.py b/libmuscle/python/libmuscle/snapshot_manager.py
index 5f67bfed..9b4c2da3 100644
--- a/libmuscle/python/libmuscle/snapshot_manager.py
+++ b/libmuscle/python/libmuscle/snapshot_manager.py
@@ -1,9 +1,9 @@
 import logging
 from datetime import datetime
 from pathlib import Path
-from typing import Optional, cast
+from typing import Optional
 
-from ymmsl import Checkpoints, Reference
+from ymmsl import Checkpoints, Reference, Operator
 
 from libmuscle.checkpoint_triggers import TriggerManager
 from libmuscle.communicator import Communicator, Message
@@ -65,8 +65,8 @@ def _set_checkpoint_info(self,
         """
         self._trigger_manager.set_checkpoint_info(utc_reference, checkpoints)
         if resume is not None:
-            self.__load_snapshot(resume)
-            snapshot = cast(Snapshot, self._resume_from_snapshot)
+            snapshot = self.load_snapshot_from_file(resume)
+            self._resume_from_snapshot = snapshot
             self._communicator.restore_message_counts(
                 snapshot.port_message_counts)
             self._trigger_manager.update_checkpoints(
@@ -136,14 +136,16 @@ def save_snapshot(self, msg: Message) -> None:
         self.__save_snapshot(msg, False)
 
     def save_final_snapshot(
-            self, msg: Message, f_init_max_timestamp: Optional[float]) -> None:
+            self, msg: Message, f_init_max_timestamp: Optional[float],
+            do_reuse: Optional[bool]) -> None:
         """Save final snapshot contained in the message object
         """
-        self.__save_snapshot(msg, True, f_init_max_timestamp)
+        self.__save_snapshot(msg, True, f_init_max_timestamp, do_reuse)
 
     def __save_snapshot(
             self, msg: Message, final: bool,
-            f_init_max_timestamp: Optional[float] = None
+            f_init_max_timestamp: Optional[float] = None,
+            do_reuse: Optional[bool] = None
             ) -> None:
         """Actual implementation used by save_(final_)snapshot.
 
@@ -155,6 +157,18 @@ def __save_snapshot(
         wallclock_time = self._trigger_manager.elapsed_walltime()
 
         port_message_counts = self._communicator.get_message_counts()
+        if final:
+            # Decrease F_INIT port counts by one: F_INIT messages are already
+            # pre-received, but not yet processed by the user code. Therefore,
+            # the snapshot state should treat these as not-received.
+            all_ports = self._communicator.list_ports()
+            ports = all_ports.get(Operator.F_INIT, [])
+            if self._communicator.settings_in_connected():
+                ports.append('muscle_settings_in')
+            for port_name in ports:
+                new_counts = [i - 1 for i in port_message_counts[port_name]]
+                port_message_counts[port_name] = new_counts
+
         snapshot = MsgPackSnapshot(
             triggers, wallclock_time, port_message_counts, final, msg)
 
@@ -169,7 +183,8 @@ def __save_snapshot(
             timestamp = f_init_max_timestamp
         self._trigger_manager.update_checkpoints(timestamp, final)
 
-    def __load_snapshot(self, snapshot_location: Path) -> None:
+    @staticmethod
+    def load_snapshot_from_file(snapshot_location: Path) -> Snapshot:
         """Load a previously stored snapshot from the filesystem
 
         Args:
@@ -186,13 +201,12 @@ def __load_snapshot(self, snapshot_location: Path) -> None:
             data = snapshot_file.read()
 
             if version == MsgPackSnapshot.SNAPSHOT_VERSION_BYTE:
-                self._resume_from_snapshot = MsgPackSnapshot.from_bytes(data)
-            else:
-                raise RuntimeError('Unable to load snapshot from'
-                                   f' {snapshot_location}: unknown version of'
-                                   ' snapshot file. Was the file saved with a'
-                                   ' different version of libmuscle or'
-                                   ' tampered with?')
+                return MsgPackSnapshot.from_bytes(data)
+            raise RuntimeError('Unable to load snapshot from'
+                               f' {snapshot_location}: unknown version of'
+                               ' snapshot file. Was the file saved with a'
+                               ' different version of libmuscle or'
+                               ' tampered with?')
 
     def __store_snapshot(self, snapshot: Snapshot) -> Path:
         """Store a snapshot on the filesystem
diff --git a/libmuscle/python/libmuscle/test/test_snapshot_manager.py b/libmuscle/python/libmuscle/test/test_snapshot_manager.py
index 972e409b..7dbce076 100644
--- a/libmuscle/python/libmuscle/test/test_snapshot_manager.py
+++ b/libmuscle/python/libmuscle/test/test_snapshot_manager.py
@@ -87,7 +87,8 @@ def test_save_load_checkpoint(tmp_path: Path) -> None:
 
     assert not snapshot_manager2.should_save_snapshot(0.4)
     assert snapshot_manager2.should_save_final_snapshot(True, 1.2)
-    snapshot_manager2.save_final_snapshot(Message(0.6, None, 'test data2'), 1.2)
+    snapshot_manager2.save_final_snapshot(
+            Message(0.6, None, 'test data2'), 1.2, True)
 
     instance, metadata = manager.submit_snapshot_metadata.call_args[0]
     assert instance == instance_id

From f2b022eb534511883244130e1b748794d1da5c39 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Mon, 21 Nov 2022 13:49:39 +0100
Subject: [PATCH 087/183] Add snapshot type (final/interm.) in resume ymmsl

---
 .../python/libmuscle/manager/snapshot_registry.py      | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/libmuscle/python/libmuscle/manager/snapshot_registry.py b/libmuscle/python/libmuscle/manager/snapshot_registry.py
index 6bdad7fa..d0b75206 100644
--- a/libmuscle/python/libmuscle/manager/snapshot_registry.py
+++ b/libmuscle/python/libmuscle/manager/snapshot_registry.py
@@ -419,15 +419,17 @@ def _generate_description(
             component_info.append((
                     str(node.instance),
                     f'{node.snapshot.timestamp:<11.6g}',
-                    f'{node.snapshot.wallclock_time:<11.6g}'))
+                    f'{node.snapshot.wallclock_time:<11.6g}',
+                    ("Intermediate", "Final")[node.snapshot.is_final_snapshot]))
             max_instance_len = max(max_instance_len, len(str(node.instance)))
         instance_with_padding = 'Instance'.ljust(max_instance_len)
         component_table = [
-                f'{instance_with_padding} t           wallclock time',
-                f'{"-" * (max_instance_len + 27)}']
+                f'{instance_with_padding} t           Wallclock time  Type',
+                f'{"-" * (max_instance_len + 41)}']
         component_table += [
                 f'{name.ljust(max_instance_len)} {timestamp} {walltime}'
-                for name, timestamp, walltime in component_info]
+                f'     {typ}'
+                for name, timestamp, walltime, typ in component_info]
         return (f'Workflow snapshot for {self._model.name}'
                 f' taken on {now.strftime("%Y-%m-%d %H:%M:%S")}.\n'
                 'Snapshot triggers:\n' +

From e881d20c2021361bba7857f18d5cd5747dfae133 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Mon, 21 Nov 2022 13:50:08 +0100
Subject: [PATCH 088/183] Add command line tool to display snapshot info

---
 muscle3/muscle3.py | 57 ++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 50 insertions(+), 7 deletions(-)

diff --git a/muscle3/muscle3.py b/muscle3/muscle3.py
index 6682215f..484e4335 100644
--- a/muscle3/muscle3.py
+++ b/muscle3/muscle3.py
@@ -1,12 +1,16 @@
 import sys
+from collections import OrderedDict
+from pathlib import Path
 from typing import Sequence
 
 import click
 import ymmsl
-from ymmsl import Identifier, PartialConfiguration
+from ymmsl import PartialConfiguration
 
 
-from libmuscle.planner.planner import Planner, Resources
+from libmuscle.planner.planner import (
+        Planner, Resources, InsufficientResourcesAvailable)
+from libmuscle.snapshot_manager import SnapshotManager
 
 
 _RESOURCES_INCOMPLETE_MODEL = """
@@ -17,13 +21,11 @@
 
 
 @click.group()
-def muscle3():
+def muscle3() -> None:
     """MUSCLE3 command line interface
 
-    In the future, this command will provide various functions for
-    running coupled simulations using MUSCLE3. For now, it does only
-    one thing, which is to calculate the number of cluster nodes
-    needed for a given simulation to run without oversubscribing.
+    This command provides various functions for running coupled simulations
+    using MUSCLE3.
 
     Use muscle3 <command> --help for help with individual commands.
     """
@@ -107,6 +109,47 @@ def resources(
     sys.exit(0)
 
 
+@muscle3.command(short_help='Display details of a stored snapshot')
+@click.argument(
+        'snapshot_files', nargs=-1, required=True, type=click.Path(
+            exists=True, file_okay=True, dir_okay=False, readable=True,
+            allow_dash=True, resolve_path=True, path_type=Path))
+@click.option(
+        '-d', '--data', is_flag=True,
+        help='Display stored data. Note this may result in a lot of output!')
+@click.option(
+        '-v', '--verbose', is_flag=True, help='Display more metadata.')
+def snapshot(
+        snapshot_files: Sequence[Path], data: bool, verbose: bool) -> None:
+    """Display information about stored snapshots.
+
+    Per provided snapshot, display metadata. Stored data can also be output by
+    supplying the '-d' or '--data' flags. Note that this may result in a lot of
+    data displayed.
+    """
+    for file in snapshot_files:
+        snapshot = SnapshotManager.load_snapshot_from_file(file)
+        click.echo(f'Snapshot at {file}:')
+        typ = 'Final' if snapshot.is_final_snapshot else 'Intermediate'
+        properties = OrderedDict([
+            ('Snapshot type', typ),
+            ('Snapshot timestamp', snapshot.message.timestamp),
+            ('Snapshot wallclock time', snapshot.wallclock_time),
+            ('Snapshot triggers', snapshot.triggers),
+        ])
+        if verbose:
+            properties.update([
+                ('Internal: Port message counts', snapshot.port_message_counts),
+            ])
+        for prop_name, prop_value in properties.items():
+            click.secho(f'{prop_name}: ', nl=False, bold=True)
+            click.echo(prop_value)
+        if data:
+            click.secho('Snapshot data:', bold=True)
+            click.echo(snapshot.message.data)
+        click.echo()
+
+
 def _load_ymmsl_files(ymmsl_files: Sequence[str]) -> PartialConfiguration:
     """Loads and merges yMMSL files."""
     configuration = PartialConfiguration()

From 6b92e5bc67ce4286158182e35b60dae12a5fd577 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Mon, 21 Nov 2022 14:42:43 +0100
Subject: [PATCH 089/183] Set TCP options in Python

Set TCP_NODELAY and TCP_QUICKACK (like the C++ code already did)
---
 libmuscle/python/libmuscle/mcp/tcp_transport_client.py | 4 ++++
 libmuscle/python/libmuscle/mcp/tcp_transport_server.py | 5 +++++
 2 files changed, 9 insertions(+)

diff --git a/libmuscle/python/libmuscle/mcp/tcp_transport_client.py b/libmuscle/python/libmuscle/mcp/tcp_transport_client.py
index cd976cae..88e68510 100644
--- a/libmuscle/python/libmuscle/mcp/tcp_transport_client.py
+++ b/libmuscle/python/libmuscle/mcp/tcp_transport_client.py
@@ -43,6 +43,10 @@ def __init__(self, location: str) -> None:
             raise RuntimeError('Could not connect to the server at location'
                                ' {}'.format(location))
         else:
+            if hasattr(socket, "TCP_NODELAY"):
+                sock.setsockopt(socket.SOL_TCP, socket.TCP_NODELAY, 1)
+            if hasattr(socket, "TCP_QUICKACK"):
+                sock.setsockopt(socket.SOL_TCP, socket.TCP_QUICKACK, 1)
             self._socket = sock
 
     def call(self, request: bytes) -> bytes:
diff --git a/libmuscle/python/libmuscle/mcp/tcp_transport_server.py b/libmuscle/python/libmuscle/mcp/tcp_transport_server.py
index 2219cd76..17831064 100644
--- a/libmuscle/python/libmuscle/mcp/tcp_transport_server.py
+++ b/libmuscle/python/libmuscle/mcp/tcp_transport_server.py
@@ -1,3 +1,4 @@
+import socket
 import socketserver as ss
 import threading
 from typing import cast, List, Optional, Tuple
@@ -19,6 +20,10 @@ def __init__(self, host_port_tuple: Tuple[str, int],
                  ) -> None:
         super().__init__(host_port_tuple, streamhandler)
         self.transport_server = transport_server
+        if hasattr(socket, "TCP_NODELAY"):
+            self.socket.setsockopt(socket.SOL_TCP, socket.TCP_NODELAY, 1)
+        if hasattr(socket, "TCP_QUICKACK"):
+            self.socket.setsockopt(socket.SOL_TCP, socket.TCP_QUICKACK, 1)
 
 
 class TcpHandler(ss.BaseRequestHandler):

From 51ed169fcee6fbae30e4048e708da8c0966dc103 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Tue, 22 Nov 2022 16:04:36 +0100
Subject: [PATCH 090/183] Rewrite macro/micro snapshot integration test

- Use new paradigm for running actors in integration test
- Update checks for changed mechanism
---
 integration_test/test_snapshot_macro_micro.py | 96 ++++++-------------
 1 file changed, 28 insertions(+), 68 deletions(-)

diff --git a/integration_test/test_snapshot_macro_micro.py b/integration_test/test_snapshot_macro_micro.py
index ae657b5b..0de8fc47 100644
--- a/integration_test/test_snapshot_macro_micro.py
+++ b/integration_test/test_snapshot_macro_micro.py
@@ -1,10 +1,9 @@
-import sys
+from .conftest import run_manager_with_actors
 
 import pytest
-from ymmsl import Operator, load
+from ymmsl import Operator, load, dump
 
 from libmuscle import Instance, Message
-from libmuscle.manager.manager import Manager
 from libmuscle.manager.run_dir import RunDir
 
 
@@ -46,7 +45,7 @@ def macro():
             if instance.should_save_snapshot(t_cur):
                 instance.save_snapshot(Message(t_cur, None, i))
 
-        if instance.should_save_final_snapshot(t_cur):
+        if instance.should_save_final_snapshot():
             instance.save_final_snapshot(Message(t_cur, None, i))
 
 
@@ -136,42 +135,23 @@ def base_config():
     macro.o_i: micro.f_i
     micro.o_f: macro.s
 settings:
-  macro.t0: 0.12
+  macro.t0: 0.14
   macro.dt: 0.17
   macro.t_max: 1.9
   micro.dt: 0.009
   micro.t_max: 0.1
   muscle_remote_log_level: {_LOG_LEVEL}
-implementations:
-  macro_implementation:
-    executable: {sys.executable}
-    args:
-    - {__file__}
-    - macro
-    supports_checkpoint: true
-  micro_implementation:
-    executable: {sys.executable}
-    args:
-    - {__file__}
-    - micro
-    supports_checkpoint: true
-resources:
-  macro:
-    threads: 1
-  micro:
-    threads: 1
 checkpoints:
+  at_end: true
   simulation_time:
   - every: 0.4""")
 
 
-@pytest.mark.skip("To be updated")
 def test_snapshot_macro_micro(tmp_path, base_config):
-    base_config.check_consistent()
     run_dir1 = RunDir(tmp_path / 'run1')
-    manager = Manager(base_config, run_dir1, _LOG_LEVEL)
-    manager.start_instances()
-    assert manager.wait()
+    run_manager_with_actors(
+            dump(base_config), run_dir1.path,
+            python_actors={'macro': macro, 'micro': micro})
 
     # Note: sorted only works because we have fewer than 10 snapshots, otherwise
     # _10 would be sorted right after _1
@@ -183,74 +163,54 @@ def test_snapshot_macro_micro(tmp_path, base_config):
     snapshot_docs = list(map(load, snapshots_ymmsl))
     assert snapshot_docs[0].resume['macro'] == macro_snapshots[0]
     assert snapshot_docs[0].resume['micro'] == micro_snapshots[0]
-    assert snapshot_docs[1].resume['macro'] == macro_snapshots[1]
-    assert snapshot_docs[1].resume['micro'] == micro_snapshots[0]
+    assert snapshot_docs[1].resume['macro'] == macro_snapshots[0]
+    assert snapshot_docs[1].resume['micro'] == micro_snapshots[1]
     for i in range(2, 7):
         assert snapshot_docs[i].resume['macro'] == macro_snapshots[i - 1]
         assert snapshot_docs[i].resume['micro'] == micro_snapshots[i - 1]
 
-    base_config.update(snapshot_docs[4])
-    del base_config.settings['muscle_snapshot_directory']
-    base_config.check_consistent()
-
     run_dir2 = RunDir(tmp_path / 'run2')
-    manager = Manager(base_config, run_dir2, _LOG_LEVEL)
-    manager.start_instances()
-    assert manager.wait()
+    base_config.update(snapshot_docs[4])  # concatenate resume info
+    run_manager_with_actors(
+            dump(base_config), run_dir2.path,
+            python_actors={'macro': macro, 'micro': micro})
 
     macro_snapshots = sorted(run_dir2.snapshot_dir().glob('macro*'))
     assert len(macro_snapshots) == 2  # 1.6, final
     micro_snapshots = sorted(run_dir2.snapshot_dir().glob('micro*'))
-    assert len(micro_snapshots) == 3  # 1.2, 1.6, final
+    assert len(micro_snapshots) == 2  # 1.6, final
     snapshots_ymmsl = sorted(run_dir2.snapshot_dir().glob('snapshot_*.ymmsl'))
     assert len(snapshots_ymmsl) == 2
 
 
-@pytest.mark.skip("To be updated")
 def test_snapshot_macro_vector_micro(tmp_path, base_config):
-    macro_implementation = base_config.implementations['macro_implementation']
-    macro_implementation.args[-1] = 'macro_vector'
     base_config.model.components[1].multiplicity = [2]
-    base_config.check_consistent()
 
     run_dir1 = RunDir(tmp_path / 'run1')
-    manager = Manager(base_config, run_dir1, _LOG_LEVEL)
-    manager.start_instances()
-    assert manager.wait()
+    run_manager_with_actors(
+            dump(base_config), run_dir1.path,
+            python_actors={'macro': macro_vector,
+                           'micro[0]': micro,
+                           'micro[1]': micro})
 
     macro_snapshots = sorted(run_dir1.snapshot_dir().glob('macro*'))
     assert len(macro_snapshots) == 6  # 0, 0.4, 0.8, 1.2, 1.6, final
     micro_snapshots = sorted(run_dir1.snapshot_dir().glob('micro*'))
     assert len(micro_snapshots) == 6 * 2  # 0, 0.4, 0.8, 1.2, 1.6, final
     snapshots_ymmsl = sorted(run_dir1.snapshot_dir().glob('snapshot_*.ymmsl'))
-    # iff micro[0] snapshots before micro[1] at t==0.4, an additional workflow
-    # snapshot can be created
-    assert len(snapshots_ymmsl) in (7, 8)
-
-    snapshot_docs = list(map(load, sorted(snapshots_ymmsl)))
-    base_config.update(snapshot_docs[-3])
-    del base_config.settings['muscle_snapshot_directory']
-    base_config.check_consistent()
+    assert len(snapshots_ymmsl) == 8
 
     run_dir2 = RunDir(tmp_path / 'run2')
-    manager = Manager(base_config, run_dir2, _LOG_LEVEL)
-    manager.start_instances()
-    assert manager.wait()
+    base_config.update(load(snapshots_ymmsl[-3]))  # concatenate resume info
+    run_manager_with_actors(
+            dump(base_config), run_dir2.path,
+            python_actors={'macro': macro_vector,
+                           'micro[0]': micro,
+                           'micro[1]': micro})
 
     macro_snapshots = sorted(run_dir2.snapshot_dir().glob('macro*'))
     assert len(macro_snapshots) == 2  # 1.6, final
     micro_snapshots = sorted(run_dir2.snapshot_dir().glob('micro*'))
-    assert len(micro_snapshots) == 3 * 2  # 1.2, 1.6, final
+    assert len(micro_snapshots) == 2 * 2  # 1.6, final
     snapshots_ymmsl = sorted(run_dir2.snapshot_dir().glob('snapshot_*.ymmsl'))
     assert len(snapshots_ymmsl) == 2
-
-
-if __name__ == "__main__":
-    if 'macro' in sys.argv:
-        macro()
-    elif 'macro_vector' in sys.argv:
-        macro_vector()
-    elif 'micro' in sys.argv:
-        micro()
-    else:
-        raise RuntimeError('Specify macro or micro on the command line')

From 76e7adf9f1edf73624ebc781864f31a4af0b6edc Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Wed, 23 Nov 2022 10:32:28 +0100
Subject: [PATCH 091/183] Add implicit checkpoint for restarting an instance

---
 integration_test/test_snapshot_macro_micro.py | 30 +++++++++++++------
 .../libmuscle/manager/snapshot_registry.py    | 18 ++++++++---
 2 files changed, 35 insertions(+), 13 deletions(-)

diff --git a/integration_test/test_snapshot_macro_micro.py b/integration_test/test_snapshot_macro_micro.py
index 0de8fc47..9f769944 100644
--- a/integration_test/test_snapshot_macro_micro.py
+++ b/integration_test/test_snapshot_macro_micro.py
@@ -161,16 +161,19 @@ def test_snapshot_macro_micro(tmp_path, base_config):
     assert len(micro_snapshots) == 6  # 0, 0.4, 0.8, 1.2, 1.6, final
     snapshots_ymmsl = sorted(run_dir1.snapshot_dir().glob('snapshot_*.ymmsl'))
     snapshot_docs = list(map(load, snapshots_ymmsl))
-    assert snapshot_docs[0].resume['macro'] == macro_snapshots[0]
+    assert 'macro' not in snapshot_docs[0].resume
     assert snapshot_docs[0].resume['micro'] == micro_snapshots[0]
     assert snapshot_docs[1].resume['macro'] == macro_snapshots[0]
-    assert snapshot_docs[1].resume['micro'] == micro_snapshots[1]
-    for i in range(2, 7):
-        assert snapshot_docs[i].resume['macro'] == macro_snapshots[i - 1]
-        assert snapshot_docs[i].resume['micro'] == micro_snapshots[i - 1]
-
+    assert snapshot_docs[1].resume['micro'] == micro_snapshots[0]
+    assert snapshot_docs[2].resume['macro'] == macro_snapshots[0]
+    assert snapshot_docs[2].resume['micro'] == micro_snapshots[1]
+    for i in range(3, 8):
+        assert snapshot_docs[i].resume['macro'] == macro_snapshots[i - 2]
+        assert snapshot_docs[i].resume['micro'] == micro_snapshots[i - 2]
+
+    # resume from the snapshots taken at t>=1.2
     run_dir2 = RunDir(tmp_path / 'run2')
-    base_config.update(snapshot_docs[4])  # concatenate resume info
+    base_config.update(snapshot_docs[5])  # add resume info
     run_manager_with_actors(
             dump(base_config), run_dir2.path,
             python_actors={'macro': macro, 'micro': micro})
@@ -182,6 +185,15 @@ def test_snapshot_macro_micro(tmp_path, base_config):
     snapshots_ymmsl = sorted(run_dir2.snapshot_dir().glob('snapshot_*.ymmsl'))
     assert len(snapshots_ymmsl) == 2
 
+    # resume from the first workflow snapshot (this restarts macro from scratch)
+    run_dir3 = RunDir(tmp_path / 'run3')
+    base_config.resume = {}                     # clear resume information
+    base_config.update(snapshot_docs[0])        # add resume info
+    base_config.settings['macro.t_max'] = 0.6   # run shorter
+    run_manager_with_actors(
+            dump(base_config), run_dir3.path,
+            python_actors={'macro': macro, 'micro': micro})
+
 
 def test_snapshot_macro_vector_micro(tmp_path, base_config):
     base_config.model.components[1].multiplicity = [2]
@@ -198,10 +210,10 @@ def test_snapshot_macro_vector_micro(tmp_path, base_config):
     micro_snapshots = sorted(run_dir1.snapshot_dir().glob('micro*'))
     assert len(micro_snapshots) == 6 * 2  # 0, 0.4, 0.8, 1.2, 1.6, final
     snapshots_ymmsl = sorted(run_dir1.snapshot_dir().glob('snapshot_*.ymmsl'))
-    assert len(snapshots_ymmsl) == 8
+    assert len(snapshots_ymmsl) == 10
 
     run_dir2 = RunDir(tmp_path / 'run2')
-    base_config.update(load(snapshots_ymmsl[-3]))  # concatenate resume info
+    base_config.update(load(snapshots_ymmsl[-3]))  # add resume info
     run_manager_with_actors(
             dump(base_config), run_dir2.path,
             python_actors={'macro': macro_vector,
diff --git a/libmuscle/python/libmuscle/manager/snapshot_registry.py b/libmuscle/python/libmuscle/manager/snapshot_registry.py
index d0b75206..cbb8bbde 100644
--- a/libmuscle/python/libmuscle/manager/snapshot_registry.py
+++ b/libmuscle/python/libmuscle/manager/snapshot_registry.py
@@ -194,7 +194,12 @@ def __init__(
         self._instances = set()             # type: Set[Reference]
         for component in config.model.components:
             self._instances.update(component.instances())
-        # TODO: create snapshot nodes for starting from scratch
+
+        # Create snapshot nodes for starting from scratch
+        self._null_snapshot = SnapshotMetadata(
+                ["Instance start"], 0, 0, None, {}, True, '')
+        for instance in self._instances:
+            self.register_snapshot(instance, self._null_snapshot)
 
     def register_snapshot(
             self, instance: Reference, snapshot: SnapshotMetadata) -> None:
@@ -243,7 +248,8 @@ def _add_snapshot(
                         peer_snapshot, self._get_connections(instance, peer))
 
         # finally, check if this snapshotnode is now part of a workflow snapshot
-        self._save_workflow_snapshot(snapshotnode)
+        if snapshot is not self._null_snapshot:
+            self._save_workflow_snapshot(snapshotnode)
 
     def _save_workflow_snapshot(self, snapshotnode: SnapshotNode) -> None:
         """Save snapshot if a workflow snapshot exists with the provided node.
@@ -402,7 +408,11 @@ def _generate_snapshot_config(
         selected_snapshots.sort(key=attrgetter('instance'))
         resume = {}
         for node in selected_snapshots:
-            resume[node.instance] = Path(node.snapshot.snapshot_filename)
+            if node.snapshot is not self._null_snapshot:
+                # Only store resume information when it is an actual snapshot
+                # created by the instance. Otherwise the instance can just be
+                # restarted from the beginning.
+                resume[node.instance] = Path(node.snapshot.snapshot_filename)
         description = self._generate_description(selected_snapshots, now)
         return PartialConfiguration(resume=resume, description=description)
 
@@ -436,7 +446,7 @@ def _generate_description(
                 '\n'.join(f'- {trigger} ({", ".join(triggers[trigger])})'
                           for trigger in sorted(triggers)) +
                 '\n\n' +
-                '\n'.join(component_table))
+                '\n'.join(component_table) + '\n')
 
     def _cleanup_snapshots(
             self, workflow_snapshots: List[List[SnapshotNode]]) -> None:

From 775834e27c7a6304a18de4261626b88e6d20d4da Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Thu, 24 Nov 2022 13:46:36 +0100
Subject: [PATCH 092/183] Implicit checkpointing for stateless instances

---
 integration_test/test_snapshot_macro_micro.py | 55 ++++++++++++-
 .../python/libmuscle/checkpoint_triggers.py   |  7 ++
 libmuscle/python/libmuscle/instance.py        | 35 ++++++---
 libmuscle/python/libmuscle/snapshot.py        | 14 ++--
 .../python/libmuscle/snapshot_manager.py      | 77 +++++++++++++------
 .../python/libmuscle/test/test_snapshot.py    | 12 +++
 .../libmuscle/test/test_snapshot_manager.py   | 63 ++++++++++++---
 7 files changed, 212 insertions(+), 51 deletions(-)

diff --git a/integration_test/test_snapshot_macro_micro.py b/integration_test/test_snapshot_macro_micro.py
index 9f769944..f9c14103 100644
--- a/integration_test/test_snapshot_macro_micro.py
+++ b/integration_test/test_snapshot_macro_micro.py
@@ -1,7 +1,7 @@
 from .conftest import run_manager_with_actors
 
 import pytest
-from ymmsl import Operator, load, dump
+from ymmsl import ImplementationState, Operator, load, dump
 
 from libmuscle import Instance, Message
 from libmuscle.manager.run_dir import RunDir
@@ -123,6 +123,28 @@ def micro():
             instance.save_final_snapshot(Message(t_cur, None, [i, t_stop]))
 
 
+def stateless_micro():
+    instance = Instance({
+            Operator.F_INIT: ['f_i'],
+            Operator.O_F: ['o_f']},
+            stateful=ImplementationState.STATELESS)
+
+    while instance.reuse_instance():
+        dt = instance.get_setting('dt', 'float')
+        t_max = instance.get_setting('t_max', 'float')
+
+        msg = instance.receive('f_i')
+        t_cur = msg.timestamp
+        i = msg.data
+        t_stop = t_cur + t_max
+
+        while t_cur < t_stop:
+            # faux time-integration for testing snapshots
+            t_cur += dt
+
+        instance.send('o_f', Message(t_cur, None, i))
+
+
 @pytest.fixture
 def base_config():
     return load(f"""ymmsl_version: v0.1
@@ -195,6 +217,37 @@ def test_snapshot_macro_micro(tmp_path, base_config):
             python_actors={'macro': macro, 'micro': micro})
 
 
+def test_snapshot_macro_stateless_micro(tmp_path, base_config):
+    run_dir1 = RunDir(tmp_path / 'run1')
+    run_manager_with_actors(
+            dump(base_config), run_dir1.path,
+            python_actors={'macro': macro, 'micro': stateless_micro})
+
+    # Note: sorted only works because we have fewer than 10 snapshots, otherwise
+    # _10 would be sorted right after _1
+    macro_snapshots = sorted(run_dir1.snapshot_dir().glob('macro*'))
+    assert len(macro_snapshots) == 6  # 0, 0.4, 0.8, 1.2, 1.6, final
+    micro_snapshots = sorted(run_dir1.snapshot_dir().glob('micro*'))
+    assert len(micro_snapshots) == 6  # 0, 0.4, 0.8, 1.2, 1.6, final
+    snapshots_ymmsl = sorted(run_dir1.snapshot_dir().glob('snapshot_*.ymmsl'))
+    snapshot_docs = list(map(load, snapshots_ymmsl))
+    assert len(snapshot_docs) == 6
+
+    # resume from the snapshot taken at t>=1.2
+    run_dir2 = RunDir(tmp_path / 'run2')
+    base_config.update(snapshot_docs[3])  # add resume info
+    run_manager_with_actors(
+            dump(base_config), run_dir2.path,
+            python_actors={'macro': macro, 'micro': stateless_micro})
+
+    macro_snapshots = sorted(run_dir2.snapshot_dir().glob('macro*'))
+    assert len(macro_snapshots) == 2  # 1.6, final
+    micro_snapshots = sorted(run_dir2.snapshot_dir().glob('micro*'))
+    assert len(micro_snapshots) == 3  # 1.2, 1.6, final
+    snapshots_ymmsl = sorted(run_dir2.snapshot_dir().glob('snapshot_*.ymmsl'))
+    assert len(snapshots_ymmsl) == 2
+
+
 def test_snapshot_macro_vector_micro(tmp_path, base_config):
     base_config.model.components[1].multiplicity = [2]
 
diff --git a/libmuscle/python/libmuscle/checkpoint_triggers.py b/libmuscle/python/libmuscle/checkpoint_triggers.py
index 57e26ca7..a33a785d 100644
--- a/libmuscle/python/libmuscle/checkpoint_triggers.py
+++ b/libmuscle/python/libmuscle/checkpoint_triggers.py
@@ -273,6 +273,13 @@ def should_save_final_snapshot(
         self._should_save_final_called = True
         return value
 
+    @property
+    def save_final_snapshot_called(self) -> bool:
+        """Check if :meth:`save_final_snapshot` was called during this
+        reuse loop.
+        """
+        return self._saved_final_checkpoint
+
     def reuse_instance(self) -> None:
         """Cleanup between instance reuse
         """
diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py
index 379067c8..f58e146f 100644
--- a/libmuscle/python/libmuscle/instance.py
+++ b/libmuscle/python/libmuscle/instance.py
@@ -8,7 +8,7 @@
 from typing_extensions import Literal
 
 from ymmsl import (Identifier, Operator, SettingValue, Port, Reference,
-                   Settings)
+                   Settings, ImplementationState)
 
 from libmuscle.communicator import Communicator, Message
 from libmuscle.settings_manager import SettingsManager
@@ -34,7 +34,8 @@ class Instance:
     This class provides a low-level send/receive API for the instance
     to use.
     """
-    def __init__(self, ports: Optional[Dict[Operator, List[str]]] = None
+    def __init__(self, ports: Optional[Dict[Operator, List[str]]] = None,
+                 stateful: ImplementationState = ImplementationState.STATEFUL
                  ) -> None:
         """Create an Instance.
 
@@ -44,6 +45,14 @@ def __init__(self, ports: Optional[Dict[Operator, List[str]]] = None
         """
         self.__is_shut_down = False
 
+        if not isinstance(stateful, ImplementationState):
+            raise ValueError(
+                    f'Invalid value supplied for "stateful": {stateful}.'
+                    ' Expected one of ImplementationState.STATEFUL,'
+                    ' ImplementationState.STATELESS or ImplementationState.'
+                    'WEAKLY_STATEFUL.')
+        self._stateful = stateful
+
         # Note that these are accessed by Muscle3, but otherwise private.
         self._name, self._index = self.__make_full_name()
         """Name and index of this instance."""
@@ -68,7 +77,8 @@ def __init__(self, ports: Optional[Dict[Operator, List[str]]] = None
         """Settings for this instance."""
 
         self._snapshot_manager = SnapshotManager(
-                self._instance_name(), self.__manager, self._communicator)
+                self._instance_name(), self.__manager, self._communicator,
+                self._stateful)
         """Keeps track of checkpointing and snapshots"""
 
         self._first_run = True
@@ -144,7 +154,8 @@ def reuse_instance(self, apply_overlay: bool = True) -> bool:
             snapshot_path = Path(snapshot_dir)
         except KeyError:
             snapshot_path = None
-        self._snapshot_manager.reuse_instance(snapshot_path)
+        self._snapshot_manager.reuse_instance(
+                snapshot_path, do_reuse, self.__f_init_max_timestamp)
 
         if not do_reuse:
             self.__close_ports()
@@ -539,11 +550,8 @@ def should_save_final_snapshot(self, *, apply_overlay: bool = True) -> bool:
                     'You may not call should_save_final_snapshot more than once'
                     ' per reuse loop.')
         self._do_reuse = self.__check_reuse_instance(apply_overlay)
-        f_init_max_timestamp = max(
-                (msg.timestamp for msg in self._f_init_cache.values()),
-                default=None)
         return self._snapshot_manager.should_save_final_snapshot(
-                self._do_reuse, f_init_max_timestamp)
+                self._do_reuse, self.__f_init_max_timestamp)
 
     def save_final_snapshot(self, message: Message) -> None:
         """Save a snapshot before O_F.
@@ -566,11 +574,16 @@ def save_final_snapshot(self, message: Message) -> None:
                 attribute can be used to store the internal state of the
                 submodel.
         """
-        f_init_max_timestamp = max(
+        return self._snapshot_manager.save_final_snapshot(
+                message, self.__f_init_max_timestamp)
+
+    @property
+    def __f_init_max_timestamp(self) -> Optional[float]:
+        """Return max timestamp of pre-received F_INIT messages
+        """
+        return max(
                 (msg.timestamp for msg in self._f_init_cache.values()),
                 default=None)
-        return self._snapshot_manager.save_final_snapshot(
-                message, f_init_max_timestamp, self._do_reuse)
 
     def _register(self) -> None:
         """Register this instance with the manager.
diff --git a/libmuscle/python/libmuscle/snapshot.py b/libmuscle/python/libmuscle/snapshot.py
index 93ed9307..633d3f3d 100644
--- a/libmuscle/python/libmuscle/snapshot.py
+++ b/libmuscle/python/libmuscle/snapshot.py
@@ -21,7 +21,7 @@ def __init__(self,
                  wallclock_time: float,
                  port_message_counts: Dict[str, List[int]],
                  is_final_snapshot: bool,
-                 message: 'communicator.Message') -> None:
+                 message: Optional['communicator.Message']) -> None:
         self.triggers = triggers
         self.wallclock_time = wallclock_time
         self.port_message_counts = port_message_counts
@@ -74,9 +74,11 @@ def to_bytes(self) -> bytes:
         }))
 
     @staticmethod
-    def message_to_bytes(message: 'communicator.Message') -> bytes:
+    def message_to_bytes(message: Optional['communicator.Message']) -> bytes:
         """Use MPPMessage serializer for serializing the message object
         """
+        if message is None:
+            return b''
         settings = Settings()
         if message.settings is not None:
             settings = message.settings
@@ -85,9 +87,11 @@ def message_to_bytes(message: 'communicator.Message') -> bytes:
                           settings, 0, message.data).encoded()
 
     @staticmethod
-    def bytes_to_message(data: bytes) -> 'communicator.Message':
+    def bytes_to_message(data: bytes) -> Optional['communicator.Message']:
         """Use MPPMessage deserializer for serializing the message object
         """
+        if not data:
+            return None
         mpp_message = MPPMessage.from_bytes(data)
         return communicator.Message(mpp_message.timestamp,
                                     mpp_message.next_timestamp,
@@ -116,8 +120,8 @@ def from_snapshot(snapshot: Snapshot, snapshot_filename: str
         return SnapshotMetadata(
             snapshot.triggers,
             snapshot.wallclock_time,
-            snapshot.message.timestamp,
-            snapshot.message.next_timestamp,
+            snapshot.message.timestamp if snapshot.message else float('NaN'),
+            snapshot.message.next_timestamp if snapshot.message else None,
             snapshot.port_message_counts,
             snapshot.is_final_snapshot,
             snapshot_filename
diff --git a/libmuscle/python/libmuscle/snapshot_manager.py b/libmuscle/python/libmuscle/snapshot_manager.py
index 9b4c2da3..54059375 100644
--- a/libmuscle/python/libmuscle/snapshot_manager.py
+++ b/libmuscle/python/libmuscle/snapshot_manager.py
@@ -1,9 +1,9 @@
 import logging
 from datetime import datetime
 from pathlib import Path
-from typing import Optional
+from typing import cast, Optional
 
-from ymmsl import Checkpoints, Reference, Operator
+from ymmsl import Checkpoints, Reference, Operator, ImplementationState
 
 from libmuscle.checkpoint_triggers import TriggerManager
 from libmuscle.communicator import Communicator, Message
@@ -14,6 +14,11 @@
 
 _MAX_FILE_EXISTS_CHECK = 10000
 
+# error text for save_snapshot when msg = None
+_NO_MESSAGE_PROVIDED = (
+        'Invalid message provided to `{}`. Please create a Message object to'
+        ' store the state of the instance in a snapshot.')
+
 
 class SnapshotManager:
     """Manages information on snapshots for the Instance
@@ -25,7 +30,8 @@ class SnapshotManager:
     def __init__(self,
                  instance_id: Reference,
                  manager: MMPClient,
-                 communicator: Communicator) -> None:
+                 communicator: Communicator,
+                 stateful: ImplementationState) -> None:
         """Create a new snapshot manager
 
         Args:
@@ -39,6 +45,7 @@ def __init__(self,
         self._safe_id = str(instance_id).replace("[", "-").replace("]", "")
         self._communicator = communicator
         self._manager = manager
+        self._stateful = stateful
 
         self._first_reuse = True
         self._trigger_manager = TriggerManager()
@@ -66,19 +73,37 @@ def _set_checkpoint_info(self,
         self._trigger_manager.set_checkpoint_info(utc_reference, checkpoints)
         if resume is not None:
             snapshot = self.load_snapshot_from_file(resume)
-            self._resume_from_snapshot = snapshot
+            if snapshot.message is not None:
+                # snapshot.message is None for implicit snapshots
+                self._resume_from_snapshot = snapshot
+                self._trigger_manager.update_checkpoints(
+                    snapshot.message.timestamp,
+                    snapshot.is_final_snapshot)
             self._communicator.restore_message_counts(
                 snapshot.port_message_counts)
-            self._trigger_manager.update_checkpoints(
-                snapshot.message.timestamp,
-                snapshot.is_final_snapshot)
 
-    def reuse_instance(self, snapshot_directory: Optional[Path]) -> None:
+    def reuse_instance(self, snapshot_directory: Optional[Path],
+                       do_reuse: bool, f_init_max_timestamp: Optional[float]
+                       ) -> None:
         """Callback on Instance.reuse_instance
 
         Args:
             snapshot_directory: Path to store this instance's snapshots in.
+            do_reuse: Used for implicit snapshots of stateless instances. See
+                :meth:`should_save_final_snapshot`.
+            f_init_max_timestamp: Used for implicit snapshots of stateless
+                instances. See :meth:`should_save_final_snapshot`.
         """
+        # Implicit snapshots for stateless / weakly stateful instances
+        # Only create implicit snapshot if not already explicitly done
+        # And not in the first reuse_instance()
+        if (self._stateful is not ImplementationState.STATEFUL and
+                not self._trigger_manager.save_final_snapshot_called and
+                not self._first_reuse):
+            if self.should_save_final_snapshot(do_reuse, f_init_max_timestamp):
+                # create an empty message object to store
+                self.__save_snapshot(None, True, f_init_max_timestamp)
+
         self._trigger_manager.reuse_instance()
 
         self._snapshot_directory = snapshot_directory
@@ -110,22 +135,22 @@ def should_init(self) -> bool:
                 self._resume_from_snapshot.is_final_snapshot)
 
     def load_snapshot(self) -> Message:
-        """Get the Message to resume from
+        """Get the Message to resume from.
         """
         if self._resume_from_snapshot is None:
             raise RuntimeError('No snapshot to load. Use "instance.resuming()"'
                                ' to check if a snapshot is available')
-        return self._resume_from_snapshot.message
+        return cast(Message, self._resume_from_snapshot.message)
 
     def should_save_snapshot(self, timestamp: float) -> bool:
-        """See :meth:`TriggerManager.should_save_snapshot`
+        """See :meth:`TriggerManager.should_save_snapshot`.
         """
         return self._trigger_manager.should_save_snapshot(timestamp)
 
     def should_save_final_snapshot(
             self, do_reuse: bool, f_init_max_timestamp: Optional[float]
             ) -> bool:
-        """See :meth:`TriggerManager.should_save_final_snapshot`
+        """See :meth:`TriggerManager.should_save_final_snapshot`.
         """
         return self._trigger_manager.should_save_final_snapshot(
                 do_reuse, f_init_max_timestamp)
@@ -133,25 +158,27 @@ def should_save_final_snapshot(
     def save_snapshot(self, msg: Message) -> None:
         """Save snapshot contained in the message object.
         """
+        if not isinstance(msg, Message):
+            raise ValueError(_NO_MESSAGE_PROVIDED.format('save_snapshot'))
         self.__save_snapshot(msg, False)
 
     def save_final_snapshot(
-            self, msg: Message, f_init_max_timestamp: Optional[float],
-            do_reuse: Optional[bool]) -> None:
-        """Save final snapshot contained in the message object
+            self, msg: Message, f_init_max_timestamp: Optional[float]) -> None:
+        """Save final snapshot contained in the message object.
         """
-        self.__save_snapshot(msg, True, f_init_max_timestamp, do_reuse)
+        if not isinstance(msg, Message):
+            raise ValueError(_NO_MESSAGE_PROVIDED.format('save_final_snapshot'))
+        self.__save_snapshot(msg, True, f_init_max_timestamp)
 
     def __save_snapshot(
-            self, msg: Message, final: bool,
-            f_init_max_timestamp: Optional[float] = None,
-            do_reuse: Optional[bool] = None
+            self, msg: Optional[Message], final: bool,
+            f_init_max_timestamp: Optional[float] = None
             ) -> None:
         """Actual implementation used by save_(final_)snapshot.
 
         Args:
-            msg: message object representing the snapshot
-            final: True iff called from save_final_snapshot
+            msg: Message object representing the snapshot.
+            final: True iff called from save_final_snapshot.
         """
         triggers = self._trigger_manager.get_triggers()
         wallclock_time = self._trigger_manager.elapsed_walltime()
@@ -176,10 +203,10 @@ def __save_snapshot(
         metadata = SnapshotMetadata.from_snapshot(snapshot, str(path))
         self._manager.submit_snapshot_metadata(self._instance_id, metadata)
 
-        timestamp = msg.timestamp
+        timestamp = msg.timestamp if msg is not None else -1.0
         if final and f_init_max_timestamp is not None:
             # For final snapshots f_init_max_snapshot is the reference time (see
-            # should_save_Final_snapshot).
+            # should_save_final_snapshot).
             timestamp = f_init_max_timestamp
         self._trigger_manager.update_checkpoints(timestamp, final)
 
@@ -190,6 +217,7 @@ def load_snapshot_from_file(snapshot_location: Path) -> Snapshot:
         Args:
             snapshot_location: path where the snapshot is stored
         """
+        _logger.debug(f'Loading snapshot from {snapshot_location}')
         if not snapshot_location.is_file():
             raise RuntimeError(f'Unable to load snapshot: {snapshot_location}'
                                ' is not a file. Please ensure this path exists'
@@ -206,7 +234,7 @@ def load_snapshot_from_file(snapshot_location: Path) -> Snapshot:
                                f' {snapshot_location}: unknown version of'
                                ' snapshot file. Was the file saved with a'
                                ' different version of libmuscle or'
-                               ' tampered with?')
+                               ' edited?')
 
     def __store_snapshot(self, snapshot: Snapshot) -> Path:
         """Store a snapshot on the filesystem
@@ -217,6 +245,7 @@ def __store_snapshot(self, snapshot: Snapshot) -> Path:
         Returns:
             Path where the snapshot is stored
         """
+        _logger.debug(f'Saving snapshot to {self._snapshot_directory}')
         if self._snapshot_directory is None:
             raise RuntimeError('Unknown snapshot directory. Did you try to'
                                ' save a snapshot before entering the reuse'
diff --git a/libmuscle/python/libmuscle/test/test_snapshot.py b/libmuscle/python/libmuscle/test/test_snapshot.py
index c959a226..f459a001 100644
--- a/libmuscle/python/libmuscle/test/test_snapshot.py
+++ b/libmuscle/python/libmuscle/test/test_snapshot.py
@@ -61,3 +61,15 @@ def test_message_with_settings() -> None:
 
     snapshot2 = MsgPackSnapshot.from_bytes(binary_snapshot)
     assert snapshot2.message.settings.get('setting') is True
+
+
+def test_implicit_snapshot() -> None:
+    message = None
+    snapshot = MsgPackSnapshot([], 0, {}, True, message)
+    assert snapshot.message is None
+
+    binary_snapshot = snapshot.to_bytes()
+    assert isinstance(binary_snapshot, bytes)
+
+    snapshot2 = MsgPackSnapshot.from_bytes(binary_snapshot)
+    assert snapshot2.message is None
diff --git a/libmuscle/python/libmuscle/test/test_snapshot_manager.py b/libmuscle/python/libmuscle/test/test_snapshot_manager.py
index 7dbce076..16f81ce3 100644
--- a/libmuscle/python/libmuscle/test/test_snapshot_manager.py
+++ b/libmuscle/python/libmuscle/test/test_snapshot_manager.py
@@ -4,7 +4,8 @@
 from unittest.mock import MagicMock
 
 import pytest
-from ymmsl import Reference, Checkpoints, CheckpointRangeRule
+from ymmsl import (
+        Reference, Checkpoints, CheckpointRangeRule, ImplementationState)
 
 from libmuscle.communicator import Message
 from libmuscle.snapshot import SnapshotMetadata
@@ -16,13 +17,15 @@ def test_no_checkpointing(caplog: pytest.LogCaptureFixture, tmp_path: Path
     manager = MagicMock()
     communicator = MagicMock()
     communicator.get_message_counts.return_value = {}
-    snapshot_manager = SnapshotManager(Reference('test'), manager, communicator)
+    snapshot_manager = SnapshotManager(
+            Reference('test'), manager, communicator,
+            ImplementationState.STATEFUL)
 
     snapshot_manager._set_checkpoint_info(
             datetime.now(timezone.utc), Checkpoints(), None)
 
     assert not snapshot_manager.resuming()
-    snapshot_manager.reuse_instance(tmp_path)
+    snapshot_manager.reuse_instance(tmp_path, True, None)
     assert not snapshot_manager.resuming()
     assert not snapshot_manager.should_save_snapshot(1)
     assert not snapshot_manager.should_save_snapshot(5000)
@@ -34,21 +37,22 @@ def test_no_checkpointing(caplog: pytest.LogCaptureFixture, tmp_path: Path
         assert "no checkpoints" in caplog.records[0].message
 
 
-def test_save_load_checkpoint(tmp_path: Path) -> None:
+def test_save_load_snapshot(tmp_path: Path) -> None:
     manager = MagicMock()
     communicator = MagicMock()
     port_message_counts = {'in': [1], 'out': [2], 'muscle_settings_in': [0]}
     communicator.get_message_counts.return_value = port_message_counts
 
     instance_id = Reference('test[1]')
-    snapshot_manager = SnapshotManager(instance_id, manager, communicator)
+    snapshot_manager = SnapshotManager(
+            instance_id, manager, communicator, ImplementationState.STATEFUL)
 
     checkpoints = Checkpoints(simulation_time=[CheckpointRangeRule(every=1)])
     snapshot_manager._set_checkpoint_info(
             datetime.now(timezone.utc), checkpoints, None)
 
     assert not snapshot_manager.resuming()
-    snapshot_manager.reuse_instance(tmp_path)
+    snapshot_manager.reuse_instance(tmp_path, True, None)
     with pytest.raises(RuntimeError):
         snapshot_manager.load_snapshot()
 
@@ -71,14 +75,15 @@ def test_save_load_checkpoint(tmp_path: Path) -> None:
     assert snapshot_path.parent == tmp_path
     assert snapshot_path.name == 'test-1_1.pack'
 
-    snapshot_manager2 = SnapshotManager(instance_id, manager, communicator)
+    snapshot_manager2 = SnapshotManager(
+            instance_id, manager, communicator, ImplementationState.STATEFUL)
 
     snapshot_manager2._set_checkpoint_info(
             datetime.now(timezone.utc), checkpoints, snapshot_path)
     communicator.restore_message_counts.assert_called_with(port_message_counts)
 
     assert snapshot_manager2.resuming()
-    snapshot_manager2.reuse_instance(tmp_path)
+    snapshot_manager2.reuse_instance(tmp_path, True, None)
     assert snapshot_manager2.resuming()
     msg = snapshot_manager2.load_snapshot()
     assert msg.timestamp == 0.2
@@ -88,7 +93,7 @@ def test_save_load_checkpoint(tmp_path: Path) -> None:
     assert not snapshot_manager2.should_save_snapshot(0.4)
     assert snapshot_manager2.should_save_final_snapshot(True, 1.2)
     snapshot_manager2.save_final_snapshot(
-            Message(0.6, None, 'test data2'), 1.2, True)
+            Message(0.6, None, 'test data2'), 1.2)
 
     instance, metadata = manager.submit_snapshot_metadata.call_args[0]
     assert instance == instance_id
@@ -104,5 +109,43 @@ def test_save_load_checkpoint(tmp_path: Path) -> None:
     assert snapshot_path.name == 'test-1_2.pack'
 
     assert snapshot_manager2.resuming()
-    snapshot_manager2.reuse_instance(tmp_path)
+    snapshot_manager2.reuse_instance(tmp_path, True, None)
     assert not snapshot_manager2.resuming()
+
+
+def test_save_load_implicit_snapshot(tmp_path: Path) -> None:
+    manager = MagicMock()
+    communicator = MagicMock()
+    port_message_counts = {'in': [1], 'out': [2], 'muscle_settings_in': [0]}
+    communicator.get_message_counts.return_value = port_message_counts
+
+    instance_id = Reference('test[1]')
+    snapshot_manager = SnapshotManager(
+            instance_id, manager, communicator, ImplementationState.STATELESS)
+
+    checkpoints = Checkpoints(simulation_time=[CheckpointRangeRule(every=1)])
+    snapshot_manager._set_checkpoint_info(
+            datetime.now(timezone.utc), checkpoints, None)
+
+    assert not snapshot_manager.resuming()
+    snapshot_manager.reuse_instance(tmp_path, True, None)
+    snapshot_manager.reuse_instance(tmp_path, True, 1.5)
+    manager.submit_snapshot_metadata.assert_called_once()
+    instance, metadata = manager.submit_snapshot_metadata.call_args[0]
+    assert instance == instance_id
+    assert isinstance(metadata, SnapshotMetadata)
+    snapshot_path = Path(metadata.snapshot_filename)
+    manager.submit_snapshot_metadata.reset_mock()
+
+    snapshot_manager2 = SnapshotManager(
+            instance_id, manager, communicator, ImplementationState.STATELESS)
+
+    snapshot_manager2._set_checkpoint_info(
+            datetime.now(timezone.utc), checkpoints, snapshot_path)
+    communicator.restore_message_counts.assert_called_with(port_message_counts)
+
+    assert not snapshot_manager2.resuming()
+    snapshot_manager2.reuse_instance(tmp_path, True, 1.5)
+    assert not snapshot_manager2.resuming()
+    snapshot_manager2.reuse_instance(tmp_path, True, 2.5)
+    manager.submit_snapshot_metadata.assert_called_once()

From 2cfcaff780b9abac700a886f550d06ae8422d684 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Thu, 24 Nov 2022 17:01:00 +0100
Subject: [PATCH 093/183] Docs: add intersphinx and update cross-refs

---
 docs/source/conf.py                    | 10 +++++
 libmuscle/python/libmuscle/grid.py     | 24 +++++-----
 libmuscle/python/libmuscle/instance.py | 62 ++++++++++++++------------
 3 files changed, 57 insertions(+), 39 deletions(-)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index 7464d1c0..5cef345e 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -47,6 +47,7 @@
         'breathe',
         'sphinx.ext.autodoc',
         'sphinx.ext.autosectionlabel',
+        'sphinx.ext.intersphinx',
         'sphinx.ext.napoleon',
         'sphinx.ext.todo',
         'sphinx.ext.viewcode',
@@ -109,6 +110,15 @@
 
 breathe_default_members = ('members',)
 
+# Configuration of sphinx.ext.intersphinx
+# See https://www.sphinx-doc.org/en/master/usage/extensions/intersphinx.html
+intersphinx_mapping = {
+    "python": ("https://docs.python.org/3/", None),
+    "numpy": ("https://numpy.org/doc/stable", None),
+    "ymmsl": ("https://ymmsl-python.readthedocs.io/en/stable", None),
+}
+
+
 # -- Patch version into installation instructions --
 def patch_installation_version():
     with open('installing.rst', 'w') as out_file:
diff --git a/libmuscle/python/libmuscle/grid.py b/libmuscle/python/libmuscle/grid.py
index 34efc806..bfe26715 100644
--- a/libmuscle/python/libmuscle/grid.py
+++ b/libmuscle/python/libmuscle/grid.py
@@ -6,13 +6,13 @@
 class Grid:
     """Represents a grid of data to send or receive.
 
-    Note that for received grids, the array of data is a read-only
-    NumPy array. If you have another array that you want to put the
-    received data into, use ``np.copyto(dest, source)`` to copy the
-    contents of the received array across into your destination array.
-    If you don't have an array yet and want a writable version of the
-    received array, use ``array.copy()`` to create a writable copy.
-    See the tutorial for examples.
+    Note that for received grids, the array of data is a read-only NumPy array.
+    If you have another array that you want to put the received data into, use
+    :external:py:func:`np.copyto(dest, source) <numpy.copyto>` to copy the
+    contents of the received array across into your destination array. If you
+    don't have an array yet and want a writable version of the received array,
+    use :external:py:meth:`array.copy()<numpy.ndarray.copy>` to create a
+    writable copy. See the tutorial for examples.
 
     Attributes:
         array (np.ndarray): An array of data
@@ -26,10 +26,12 @@ def __init__(
         A Grid object represents an multi-dimensional array of data. It
         has a type, a shape, and optionally a list of index names.
 
-        Supported data types are 4- and 8-byte integers (numpy.int32,
-        numpy.int64), 4- and 8-byte floats (numpy.float32,
-        numpy.float64), and booleans (np.bool_, np.bool8). The ``data``
-        argument must be a NumPy array of one of those types.
+        Supported data types are 4- and 8-byte integers
+        (:external:py:attr:`numpy.int32`, :external:py:attr:`numpy.int64`),
+        4- and 8-byte floats (:external:py:attr:`numpy.float32`,
+        :external:py:attr:`numpy.float64`), and booleans
+        (:external:py:class:`numpy.bool_`, :external:py:attr:`numpy.bool8`). The
+        ``data`` argument must be a NumPy array of one of those types.
 
         If ``indexes`` is given, then it must be a list of strings of
         the same length as the number of dimensions of ``data``, and
diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py
index f58e146f..03f3d494 100644
--- a/libmuscle/python/libmuscle/instance.py
+++ b/libmuscle/python/libmuscle/instance.py
@@ -40,8 +40,12 @@ def __init__(self, ports: Optional[Dict[Operator, List[str]]] = None,
         """Create an Instance.
 
         Args:
-            ports: A list of port names for each operator of this
-                component.
+            ports: A list of port names for each
+                :external:py:class:`~ymmsl.Operator` of this component.
+            stateful: Indicate whether this instance carries state between
+                iterations of the reuse loop. See
+                :external:py:class:`ymmsl.ImplementationState` for a description
+                of the options.
         """
         self.__is_shut_down = False
 
@@ -124,7 +128,7 @@ def reuse_instance(self, apply_overlay: bool = True) -> bool:
                 overlay or to save it. If you're going to use
                 :meth:`receive_with_settings` on your F_INIT ports,
                 set this to False. If you don't know what that means,
-                just call `reuse_instance()` without specifying this
+                just call :meth:`reuse_instance()` without specifying this
                 and everything will be fine. If it turns out that you
                 did need to specify False, MUSCLE3 will tell you about
                 it in an error message and you can add it still.
@@ -236,7 +240,7 @@ def get_setting(self, name: str, typ: Optional[str] = None
                 self._instance_name(), Reference(name), typ)
 
     def list_ports(self) -> Dict[Operator, List[str]]:
-        """Returns a description of the ports that this CE has.
+        """Returns a description of the ports that this Instance has.
 
         Note that the result has almost the same format as the port
         declarations you pass when making an Instance. The only
@@ -244,9 +248,9 @@ def list_ports(self) -> Dict[Operator, List[str]]:
         even if the port is a vector port.
 
         Returns:
-            A dictionary, indexed by Operator, containing lists of
-            port names. Operators with no associated ports are not
-            included.
+            A dictionary, indexed by :external:py:class:`~ymmsl.Operator`,
+            containing lists of port names. Operators with no associated ports
+            are not included.
         """
         return self._communicator.list_ports()
 
@@ -299,7 +303,8 @@ def get_port_length(self, port: str) -> int:
         Args:
             port: The name of the port to measure.
 
-        Raises: RuntimeError if this is a scalar port.
+        Raises:
+            RuntimeError: If this is a scalar port.
         """
         return self._communicator.get_port(port).get_length()
 
@@ -307,7 +312,7 @@ def set_port_length(self, port: str, length: int) -> None:
         """Resizes the port to the given length.
 
         You should check whether the port is resizable using
-        `is_resizable()` first; whether it is depends on how this
+        :meth:`is_resizable()` first; whether it is depends on how this
         component is wired up, so you should check.
 
         Args:
@@ -324,7 +329,7 @@ def send(self, port_name: str, message: Message,
         """Send a message to the outside world.
 
         Sending is non-blocking, a copy of the message will be made
-        and stored until the receiver is ready to receive it.
+        and stored in memory until the receiver is ready to receive it.
 
         Args:
             port_name: The port on which this message is to be sent.
@@ -410,8 +415,9 @@ def receive_with_settings(
     def snapshots_enabled(self) -> bool:
         """Check if the current workflow has snapshots enabled.
 
-        When snapshots are not enabled, all calls to should_save_snapshot and
-        should_save_final_snapshot will return False.
+        When snapshots are not enabled, all calls to
+        :meth:`should_save_snapshot` and :meth:`should_save_final_snapshot` will
+        return False.
 
         Returns:
             True iff checkpoint rules are defined in the workflow yMMSL.
@@ -446,7 +452,7 @@ def should_init(self) -> bool:
         before attempting to receive data on F_INIT ports.
 
         Returns:
-            True iff the submodel must skip the F_INIT step
+            True if the submodel must execute the F_INIT step, False otherwise.
         """
         return self._snapshot_manager.should_init()
 
@@ -465,7 +471,8 @@ def load_snapshot(self) -> Message:
         return self._snapshot_manager.load_snapshot()
 
     def should_save_snapshot(self, timestamp: float) -> bool:
-        """Check if a snapshot should be saved inside a time-integration loop.
+        """Check if a snapshot should be saved after the S Operator of the
+        submodel.
 
         This method checks if a snapshot should be saved right now, based on the
         provided timestamp and passed wallclock time.
@@ -487,7 +494,7 @@ def should_save_snapshot(self, timestamp: float) -> bool:
         return self._snapshot_manager.should_save_snapshot(timestamp)
 
     def save_snapshot(self, message: Message) -> None:
-        """Save a snapshot inside a time-integration loop.
+        """Save a snapshot after the S Operator of the submodel.
 
         Before saving a snapshot, you should check using
         :meth:`should_save_snapshot` if a snapshot should be saved according to
@@ -513,14 +520,13 @@ def save_snapshot(self, message: Message) -> None:
         return self._snapshot_manager.save_snapshot(message)
 
     def should_save_final_snapshot(self, *, apply_overlay: bool = True) -> bool:
-        """Check if a snapshot should be saved before O_F.
+        """Check if a snapshot should be saved at the end of the reuse loop.
 
-        This method checks if a snapshot should be saved right now, based on the
-        provided timestamp and passed wallclock time.
+        This method checks if a snapshot should be saved now.
 
         When this method returns True, the submodel must also save a snapshot
-        through :meth:`save_final_snapshot`. A RuntimeError will be generated
-        when not doing so.
+        through :meth:`save_final_snapshot`. A :class:`RuntimeError` will be
+        generated when not doing so.
 
         See also :meth:`should_save_snapshot` for the variant that may be called
         inside of a time-integration loop of the submodel.
@@ -528,7 +534,7 @@ def should_save_final_snapshot(self, *, apply_overlay: bool = True) -> bool:
         .. note::
             This method will block until it can determine whether a final
             snapshot should be taken. This means it must also determine if this
-            instance is reused. The optional keword-only argument
+            instance is reused. The optional keyword-only argument
             `apply_overlay` has the same meaning as for :meth:`reuse_instance`.
 
         Args:
@@ -536,10 +542,10 @@ def should_save_final_snapshot(self, *, apply_overlay: bool = True) -> bool:
                 overlay or to save it. If you're going to use
                 :meth:`receive_with_settings` on your F_INIT ports, set this to
                 False. If you don't know what that means, just call
-                `reuse_instance()` without specifying this and everything will
-                be fine. If it turns out that you did need to specify False,
-                MUSCLE3 will tell you about it in an error message and you can
-                add it still.
+                :meth:`should_save_final_snapshot()` without specifying this and
+                everything will be fine. If it turns out that you did need to
+                specify False, MUSCLE3 will tell you about it in an error
+                message and you can add it still.
 
         Returns:
             True iff a final snapshot should be taken by the submodel according
@@ -554,7 +560,7 @@ def should_save_final_snapshot(self, *, apply_overlay: bool = True) -> bool:
                 self._do_reuse, self.__f_init_max_timestamp)
 
     def save_final_snapshot(self, message: Message) -> None:
-        """Save a snapshot before O_F.
+        """Save a snapshot at the end of the reuse loop.
 
         Before saving a snapshot, you should check using
         :meth:`should_save_final_snapshot` if a snapshot should be saved
@@ -566,8 +572,8 @@ def save_final_snapshot(self, message: Message) -> None:
         submodels of the run (and therefore it is not useful to restart from).
         It could also lead to a lot of snapshot files clogging your file system.
 
-        See also :meth:`save_snapshot` for the variant that may be called inside
-        of a time-integration loop of the submodel.
+        See also :meth:`save_snapshot` for the variant that may be called after
+        each S Operator of the submodel.
 
         Args:
             message: Message object that is saved as snapshot. The data

From fea0e74b1305210d1128da60bdccab2d8e25dde7 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Thu, 24 Nov 2022 17:14:54 +0100
Subject: [PATCH 094/183] Update doxyfile

- `doxygen -u`
- disable html output
- ignore tests and bindings source folders
---
 Doxyfile | 242 +++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 164 insertions(+), 78 deletions(-)

diff --git a/Doxyfile b/Doxyfile
index f0c512dd..a8b7d7b2 100644
--- a/Doxyfile
+++ b/Doxyfile
@@ -1,4 +1,4 @@
-# Doxyfile 1.8.13
+# Doxyfile 1.8.17
 
 # This file describes the settings to be used by the documentation system
 # doxygen (www.doxygen.org) for a project.
@@ -17,11 +17,11 @@
 # Project related configuration options
 #---------------------------------------------------------------------------
 
-# This tag specifies the encoding used for all characters in the config file
-# that follow. The default is UTF-8 which is also the encoding used for all text
-# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv
-# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv
-# for the list of possible encodings.
+# This tag specifies the encoding used for all characters in the configuration
+# file that follow. The default is UTF-8 which is also the encoding used for all
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
+# iconv built into libc) for the transcoding. See
+# https://www.gnu.org/software/libiconv/ for the list of possible encodings.
 # The default value is: UTF-8.
 
 DOXYFILE_ENCODING      = UTF-8
@@ -93,6 +93,14 @@ ALLOW_UNICODE_NAMES    = NO
 
 OUTPUT_LANGUAGE        = English
 
+# The OUTPUT_TEXT_DIRECTION tag is used to specify the direction in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all generated output in the proper direction.
+# Possible values are: None, LTR, RTL and Context.
+# The default value is: None.
+
+OUTPUT_TEXT_DIRECTION  = None
+
 # If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member
 # descriptions after the members that are listed in the file and class
 # documentation (similar to Javadoc). Set to NO to disable this.
@@ -189,6 +197,16 @@ SHORT_NAMES            = NO
 
 JAVADOC_AUTOBRIEF      = YES
 
+# If the JAVADOC_BANNER tag is set to YES then doxygen will interpret a line
+# such as
+# /***************
+# as being the beginning of a Javadoc-style comment "banner". If set to NO, the
+# Javadoc-style will behave just like regular comments and it will not be
+# interpreted by doxygen.
+# The default value is: NO.
+
+JAVADOC_BANNER         = NO
+
 # If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first
 # line (until the first dot) of a Qt-style comment as the brief description. If
 # set to NO, the Qt-style will behave just like regular Qt-style comments (thus
@@ -236,7 +254,12 @@ TAB_SIZE               = 4
 # will allow you to put the command \sideeffect (or @sideeffect) in the
 # documentation, which will result in a user-defined paragraph with heading
 # "Side Effects:". You can put \n's in the value part of an alias to insert
-# newlines.
+# newlines (in the resulting output). You can put ^^ in the value part of an
+# alias to insert a newline as if a physical newline was in the original file.
+# When you need a literal { or } or , in the value part of an alias you have to
+# escape them by means of a backslash (\), this can lead to conflicts with the
+# commands \{ and \} for these it is advised to use the version @{ and @} or use
+# a double escape (\\{ and \\})
 
 ALIASES                =
 
@@ -274,17 +297,26 @@ OPTIMIZE_FOR_FORTRAN   = NO
 
 OPTIMIZE_OUTPUT_VHDL   = NO
 
+# Set the OPTIMIZE_OUTPUT_SLICE tag to YES if your project consists of Slice
+# sources only. Doxygen will then generate output that is more tailored for that
+# language. For instance, namespaces will be presented as modules, types will be
+# separated into more groups, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_SLICE  = NO
+
 # Doxygen selects the parser to use depending on the extension of the files it
 # parses. With this tag you can assign which parser to use for a given
 # extension. Doxygen has a built-in mapping, but you can override or extend it
 # using this tag. The format is ext=language, where ext is a file extension, and
-# language is one of the parsers supported by doxygen: IDL, Java, Javascript,
-# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran:
-# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran:
-# Fortran. In the later case the parser tries to guess whether the code is fixed
-# or free formatted code, this is the default for Fortran type files), VHDL. For
-# instance to make doxygen treat .inc files as Fortran files (default is PHP),
-# and .f files as C (default is Fortran), use: inc=Fortran f=C.
+# language is one of the parsers supported by doxygen: IDL, Java, JavaScript,
+# Csharp (C#), C, C++, D, PHP, md (Markdown), Objective-C, Python, Slice,
+# Fortran (fixed format Fortran: FortranFixed, free formatted Fortran:
+# FortranFree, unknown formatted Fortran: Fortran. In the later case the parser
+# tries to guess whether the code is fixed or free formatted code, this is the
+# default for Fortran type files), VHDL, tcl. For instance to make doxygen treat
+# .inc files as Fortran files (default is PHP), and .f files as C (default is
+# Fortran), use: inc=Fortran f=C.
 #
 # Note: For files without extension you can use no_extension as a placeholder.
 #
@@ -295,7 +327,7 @@ EXTENSION_MAPPING      =
 
 # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
 # according to the Markdown format, which allows for more readable
-# documentation. See http://daringfireball.net/projects/markdown/ for details.
+# documentation. See https://daringfireball.net/projects/markdown/ for details.
 # The output of markdown processing is further processed by doxygen, so you can
 # mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in
 # case of backward compatibilities issues.
@@ -307,7 +339,7 @@ MARKDOWN_SUPPORT       = YES
 # to that level are automatically included in the table of contents, even if
 # they do not have an id attribute.
 # Note: This feature currently applies only to Markdown headings.
-# Minimum value: 0, maximum value: 99, default value: 0.
+# Minimum value: 0, maximum value: 99, default value: 5.
 # This tag requires that the tag MARKDOWN_SUPPORT is set to YES.
 
 TOC_INCLUDE_HEADINGS   = 0
@@ -337,7 +369,7 @@ BUILTIN_STL_SUPPORT    = NO
 CPP_CLI_SUPPORT        = NO
 
 # Set the SIP_SUPPORT tag to YES if your project consists of sip (see:
-# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen
+# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen
 # will parse them like normal C++ but will assume all classes use public instead
 # of private inheritance when no explicit protection keyword is present.
 # The default value is: NO.
@@ -443,6 +475,12 @@ EXTRACT_ALL            = NO
 
 EXTRACT_PRIVATE        = NO
 
+# If the EXTRACT_PRIV_VIRTUAL tag is set to YES, documented private virtual
+# methods of a class will be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PRIV_VIRTUAL   = NO
+
 # If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal
 # scope will be included in the documentation.
 # The default value is: NO.
@@ -497,8 +535,8 @@ HIDE_UNDOC_MEMBERS     = NO
 HIDE_UNDOC_CLASSES     = NO
 
 # If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend
-# (class|struct|union) declarations. If set to NO, these declarations will be
-# included in the documentation.
+# declarations. If set to NO, these declarations will be included in the
+# documentation.
 # The default value is: NO.
 
 HIDE_FRIEND_COMPOUNDS  = YES
@@ -521,7 +559,7 @@ INTERNAL_DOCS          = NO
 # names in lower-case letters. If set to YES, upper-case letters are also
 # allowed. This is useful if you have classes or files whose names only differ
 # in case and if your file system supports case sensitive file names. Windows
-# and Mac users are advised to set this option to NO.
+# (including Cygwin) ands Mac users are advised to set this option to NO.
 # The default value is: system dependent.
 
 CASE_SENSE_NAMES       = YES
@@ -708,7 +746,7 @@ LAYOUT_FILE            =
 # The CITE_BIB_FILES tag can be used to specify one or more bib files containing
 # the reference definitions. This must be a list of .bib files. The .bib
 # extension is automatically appended if omitted. This requires the bibtex tool
-# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info.
+# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info.
 # For LaTeX the style of the bibliography can be controlled using
 # LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
 # search path. See also \cite for info how to create references.
@@ -753,7 +791,8 @@ WARN_IF_DOC_ERROR      = YES
 # This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that
 # are documented, but have no documentation for their parameters or return
 # value. If set to NO, doxygen will only warn about wrong or incomplete
-# parameter documentation, but not about the absence of documentation.
+# parameter documentation, but not about the absence of documentation. If
+# EXTRACT_ALL is set to YES then this flag will automatically be disabled.
 # The default value is: NO.
 
 WARN_NO_PARAMDOC       = NO
@@ -795,7 +834,7 @@ INPUT                  = libmuscle/cpp/src
 # This tag can be used to specify the character encoding of the source files
 # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
 # libiconv (or the iconv built into libc) for the transcoding. See the libiconv
-# documentation (see: http://www.gnu.org/software/libiconv) for the list of
+# documentation (see: https://www.gnu.org/software/libiconv/) for the list of
 # possible encodings.
 # The default value is: UTF-8.
 
@@ -812,8 +851,10 @@ INPUT_ENCODING         = UTF-8
 # If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp,
 # *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h,
 # *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc,
-# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f95, *.f03, *.f08,
-# *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf and *.qsf.
+# *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C comment),
+# *.doc (to be provided as doxygen C comment), *.txt (to be provided as doxygen
+# C comment), *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f, *.for, *.tcl, *.vhd,
+# *.vhdl, *.ucf, *.qsf and *.ice.
 
 FILE_PATTERNS          = *.c \
                          *.cc \
@@ -873,7 +914,10 @@ RECURSIVE              = YES
 # Note that relative paths are relative to the directory from which doxygen is
 # run.
 
-EXCLUDE                = libmuscle/cpp/src/muscle_manager_protocol
+EXCLUDE                = libmuscle/cpp/src/libmuscle/bindings \
+                         libmuscle/cpp/src/libmuscle/tests \
+                         libmuscle/cpp/src/ymmsl/bindings \
+                         libmuscle/cpp/src/ymmsl/tests
 
 # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
 # directories that are symbolic links (a Unix file system feature) are excluded
@@ -1011,7 +1055,7 @@ INLINE_SOURCES         = NO
 STRIP_CODE_COMMENTS    = YES
 
 # If the REFERENCED_BY_RELATION tag is set to YES then for each documented
-# function all documented functions referencing it will be listed.
+# entity all documented functions referencing it will be listed.
 # The default value is: NO.
 
 REFERENCED_BY_RELATION = NO
@@ -1043,12 +1087,12 @@ SOURCE_TOOLTIPS        = YES
 # If the USE_HTAGS tag is set to YES then the references to source code will
 # point to the HTML generated by the htags(1) tool instead of doxygen built-in
 # source browser. The htags tool is part of GNU's global source tagging system
-# (see http://www.gnu.org/software/global/global.html). You will need version
+# (see https://www.gnu.org/software/global/global.html). You will need version
 # 4.8.6 or higher.
 #
 # To use it do the following:
 # - Install the latest version of global
-# - Enable SOURCE_BROWSER and USE_HTAGS in the config file
+# - Enable SOURCE_BROWSER and USE_HTAGS in the configuration file
 # - Make sure the INPUT points to the root of the source tree
 # - Run doxygen as normal
 #
@@ -1076,7 +1120,7 @@ VERBATIM_HEADERS       = YES
 # rich C++ code for which doxygen's built-in parser lacks the necessary type
 # information.
 # Note: The availability of this option depends on whether or not doxygen was
-# generated with the -Duse-libclang=ON option for CMake.
+# generated with the -Duse_libclang=ON option for CMake.
 # The default value is: NO.
 
 CLANG_ASSISTED_PARSING = NO
@@ -1089,6 +1133,16 @@ CLANG_ASSISTED_PARSING = NO
 
 CLANG_OPTIONS          =
 
+# If clang assisted parsing is enabled you can provide the clang parser with the
+# path to the compilation database (see:
+# http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html) used when the files
+# were built. This is equivalent to specifying the "-p" option to a clang tool,
+# such as clang-check. These options will then be passed to the parser.
+# Note: The availability of this option depends on whether or not doxygen was
+# generated with the -Duse_libclang=ON option for CMake.
+
+CLANG_DATABASE_PATH    =
+
 #---------------------------------------------------------------------------
 # Configuration options related to the alphabetical class index
 #---------------------------------------------------------------------------
@@ -1122,7 +1176,7 @@ IGNORE_PREFIX          =
 # If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output
 # The default value is: YES.
 
-GENERATE_HTML          = YES
+GENERATE_HTML          = NO
 
 # The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a
 # relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
@@ -1207,7 +1261,7 @@ HTML_EXTRA_FILES       =
 # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
 # will adjust the colors in the style sheet and background images according to
 # this color. Hue is specified as an angle on a colorwheel, see
-# http://en.wikipedia.org/wiki/Hue for more information. For instance the value
+# https://en.wikipedia.org/wiki/Hue for more information. For instance the value
 # 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300
 # purple, and 360 is red again.
 # Minimum value: 0, maximum value: 359, default value: 220.
@@ -1243,6 +1297,17 @@ HTML_COLORSTYLE_GAMMA  = 80
 
 HTML_TIMESTAMP         = NO
 
+# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML
+# documentation will contain a main index with vertical navigation menus that
+# are dynamically created via JavaScript. If disabled, the navigation index will
+# consists of multiple levels of tabs that are statically embedded in every HTML
+# page. Disable this option to support browsers that do not have JavaScript,
+# like the Qt help browser.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_DYNAMIC_MENUS     = YES
+
 # If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
 # documentation will contain sections that can be hidden and shown after the
 # page has loaded.
@@ -1266,13 +1331,13 @@ HTML_INDEX_NUM_ENTRIES = 100
 
 # If the GENERATE_DOCSET tag is set to YES, additional index files will be
 # generated that can be used as input for Apple's Xcode 3 integrated development
-# environment (see: http://developer.apple.com/tools/xcode/), introduced with
-# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a
+# environment (see: https://developer.apple.com/xcode/), introduced with OSX
+# 10.5 (Leopard). To create a documentation set, doxygen will generate a
 # Makefile in the HTML output directory. Running make will produce the docset in
 # that directory and running make install will install the docset in
 # ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
-# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
-# for more information.
+# startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy
+# genXcode/_index.html for more information.
 # The default value is: NO.
 # This tag requires that the tag GENERATE_HTML is set to YES.
 
@@ -1311,7 +1376,7 @@ DOCSET_PUBLISHER_NAME  = Publisher
 # If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
 # additional HTML index files: index.hhp, index.hhc, and index.hhk. The
 # index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
-# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on
+# (see: https://www.microsoft.com/en-us/download/details.aspx?id=21138) on
 # Windows.
 #
 # The HTML Help Workshop contains a compiler that can convert all HTML output
@@ -1387,7 +1452,7 @@ QCH_FILE               =
 
 # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
 # Project output. For more information please see Qt Help Project / Namespace
-# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace).
+# (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace).
 # The default value is: org.doxygen.Project.
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
@@ -1395,7 +1460,7 @@ QHP_NAMESPACE          = org.doxygen.Project
 
 # The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt
 # Help Project output. For more information please see Qt Help Project / Virtual
-# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual-
+# Folders (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-
 # folders).
 # The default value is: doc.
 # This tag requires that the tag GENERATE_QHP is set to YES.
@@ -1404,7 +1469,7 @@ QHP_VIRTUAL_FOLDER     = doc
 
 # If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
 # filter to add. For more information please see Qt Help Project / Custom
-# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
+# Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-
 # filters).
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
@@ -1412,7 +1477,7 @@ QHP_CUST_FILTER_NAME   =
 
 # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
 # custom filter to add. For more information please see Qt Help Project / Custom
-# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
+# Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-
 # filters).
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
@@ -1420,7 +1485,7 @@ QHP_CUST_FILTER_ATTRS  =
 
 # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
 # project's filter section matches. Qt Help Project / Filter Attributes (see:
-# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes).
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#filter-attributes).
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
 QHP_SECT_FILTER_ATTRS  =
@@ -1513,7 +1578,7 @@ EXT_LINKS_IN_WINDOW    = NO
 
 FORMULA_FONTSIZE       = 10
 
-# Use the FORMULA_TRANPARENT tag to determine whether or not the images
+# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
 # generated for formulas are transparent PNGs. Transparent PNGs are not
 # supported properly for IE 6.0, but are supported on all modern browsers.
 #
@@ -1524,8 +1589,14 @@ FORMULA_FONTSIZE       = 10
 
 FORMULA_TRANSPARENT    = YES
 
+# The FORMULA_MACROFILE can contain LaTeX \newcommand and \renewcommand commands
+# to create new LaTeX commands to be used in formulas as building blocks. See
+# the section "Including formulas" for details.
+
+FORMULA_MACROFILE      =
+
 # Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
-# http://www.mathjax.org) which uses client side Javascript for the rendering
+# https://www.mathjax.org) which uses client side JavaScript for the rendering
 # instead of using pre-rendered bitmaps. Use this if you do not have LaTeX
 # installed or if you want to formulas look prettier in the HTML output. When
 # enabled you may also need to install MathJax separately and configure the path
@@ -1552,8 +1623,8 @@ MATHJAX_FORMAT         = HTML-CSS
 # MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
 # Content Delivery Network so you can quickly see the result without installing
 # MathJax. However, it is strongly recommended to install a local copy of
-# MathJax from http://www.mathjax.org before deployment.
-# The default value is: http://cdn.mathjax.org/mathjax/latest.
+# MathJax from https://www.mathjax.org before deployment.
+# The default value is: https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/.
 # This tag requires that the tag USE_MATHJAX is set to YES.
 
 MATHJAX_RELPATH        = http://cdn.mathjax.org/mathjax/latest
@@ -1595,7 +1666,7 @@ MATHJAX_CODEFILE       =
 SEARCHENGINE           = YES
 
 # When the SERVER_BASED_SEARCH tag is enabled the search engine will be
-# implemented using a web server instead of a web client using Javascript. There
+# implemented using a web server instead of a web client using JavaScript. There
 # are two flavors of web server based searching depending on the EXTERNAL_SEARCH
 # setting. When disabled, doxygen will generate a PHP script for searching and
 # an index file used by the script. When EXTERNAL_SEARCH is enabled the indexing
@@ -1614,7 +1685,7 @@ SERVER_BASED_SEARCH    = NO
 #
 # Doxygen ships with an example indexer (doxyindexer) and search engine
 # (doxysearch.cgi) which are based on the open source search engine library
-# Xapian (see: http://xapian.org/).
+# Xapian (see: https://xapian.org/).
 #
 # See the section "External Indexing and Searching" for details.
 # The default value is: NO.
@@ -1627,7 +1698,7 @@ EXTERNAL_SEARCH        = NO
 #
 # Doxygen ships with an example indexer (doxyindexer) and search engine
 # (doxysearch.cgi) which are based on the open source search engine library
-# Xapian (see: http://xapian.org/). See the section "External Indexing and
+# Xapian (see: https://xapian.org/). See the section "External Indexing and
 # Searching" for details.
 # This tag requires that the tag SEARCHENGINE is set to YES.
 
@@ -1679,21 +1750,35 @@ LATEX_OUTPUT           = latex
 # The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
 # invoked.
 #
-# Note that when enabling USE_PDFLATEX this option is only used for generating
-# bitmaps for formulas in the HTML output, but not in the Makefile that is
-# written to the output directory.
-# The default file is: latex.
+# Note that when not enabling USE_PDFLATEX the default is latex when enabling
+# USE_PDFLATEX the default is pdflatex and when in the later case latex is
+# chosen this is overwritten by pdflatex. For specific output languages the
+# default can have been set differently, this depends on the implementation of
+# the output language.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
 LATEX_CMD_NAME         = latex
 
 # The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate
 # index for LaTeX.
+# Note: This tag is used in the Makefile / make.bat.
+# See also: LATEX_MAKEINDEX_CMD for the part in the generated output file
+# (.tex).
 # The default file is: makeindex.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
 MAKEINDEX_CMD_NAME     = makeindex
 
+# The LATEX_MAKEINDEX_CMD tag can be used to specify the command name to
+# generate index for LaTeX. In case there is no backslash (\) as first character
+# it will be automatically added in the LaTeX code.
+# Note: This tag is used in the generated output file (.tex).
+# See also: MAKEINDEX_CMD_NAME for the part in the Makefile / make.bat.
+# The default value is: makeindex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_MAKEINDEX_CMD    = makeindex
+
 # If the COMPACT_LATEX tag is set to YES, doxygen generates more compact LaTeX
 # documents. This may be useful for small projects and may help to save some
 # trees in general.
@@ -1814,7 +1899,7 @@ LATEX_SOURCE_CODE      = NO
 
 # The LATEX_BIB_STYLE tag can be used to specify the style to use for the
 # bibliography, e.g. plainnat, or ieeetr. See
-# http://en.wikipedia.org/wiki/BibTeX and \cite for more info.
+# https://en.wikipedia.org/wiki/BibTeX and \cite for more info.
 # The default value is: plain.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
@@ -1828,6 +1913,14 @@ LATEX_BIB_STYLE        = plain
 
 LATEX_TIMESTAMP        = NO
 
+# The LATEX_EMOJI_DIRECTORY tag is used to specify the (relative or absolute)
+# path from which the emoji images will be read. If a relative path is entered,
+# it will be relative to the LATEX_OUTPUT directory. If left blank the
+# LATEX_OUTPUT directory will be used.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EMOJI_DIRECTORY  =
+
 #---------------------------------------------------------------------------
 # Configuration options related to the RTF output
 #---------------------------------------------------------------------------
@@ -1867,9 +1960,9 @@ COMPACT_RTF            = NO
 
 RTF_HYPERLINKS         = NO
 
-# Load stylesheet definitions from file. Syntax is similar to doxygen's config
-# file, i.e. a series of assignments. You only have to provide replacements,
-# missing definitions are set to their default value.
+# Load stylesheet definitions from file. Syntax is similar to doxygen's
+# configuration file, i.e. a series of assignments. You only have to provide
+# replacements, missing definitions are set to their default value.
 #
 # See also section "Doxygen usage" for information on how to generate the
 # default style sheet that doxygen normally uses.
@@ -1878,8 +1971,8 @@ RTF_HYPERLINKS         = NO
 RTF_STYLESHEET_FILE    =
 
 # Set optional variables used in the generation of an RTF document. Syntax is
-# similar to doxygen's config file. A template extensions file can be generated
-# using doxygen -e rtf extensionFile.
+# similar to doxygen's configuration file. A template extensions file can be
+# generated using doxygen -e rtf extensionFile.
 # This tag requires that the tag GENERATE_RTF is set to YES.
 
 RTF_EXTENSIONS_FILE    =
@@ -1965,6 +2058,13 @@ XML_OUTPUT             = xml
 
 XML_PROGRAMLISTING     = YES
 
+# If the XML_NS_MEMB_FILE_SCOPE tag is set to YES, doxygen will include
+# namespace members in file scope as well, matching the HTML output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_NS_MEMB_FILE_SCOPE = NO
+
 #---------------------------------------------------------------------------
 # Configuration options related to the DOCBOOK output
 #---------------------------------------------------------------------------
@@ -1997,9 +2097,9 @@ DOCBOOK_PROGRAMLISTING = NO
 #---------------------------------------------------------------------------
 
 # If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an
-# AutoGen Definitions (see http://autogen.sf.net) file that captures the
-# structure of the code including all documentation. Note that this feature is
-# still experimental and incomplete at the moment.
+# AutoGen Definitions (see http://autogen.sourceforge.net/) file that captures
+# the structure of the code including all documentation. Note that this feature
+# is still experimental and incomplete at the moment.
 # The default value is: NO.
 
 GENERATE_AUTOGEN_DEF   = NO
@@ -2099,7 +2199,8 @@ INCLUDE_FILE_PATTERNS  =
 # recursively expanded use the := operator instead of the = operator.
 # This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
 
-PREDEFINED             = MUSCLE_ENABLE_MPI DOXYGEN_SHOULD_SKIP_THIS
+PREDEFINED             = MUSCLE_ENABLE_MPI \
+                         DOXYGEN_SHOULD_SKIP_THIS
 
 # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
 # tag can be used to specify a list of macro names that should be expanded. The
@@ -2166,12 +2267,6 @@ EXTERNAL_GROUPS        = YES
 
 EXTERNAL_PAGES         = YES
 
-# The PERL_PATH should be the absolute path and name of the perl script
-# interpreter (i.e. the result of 'which perl').
-# The default file (with absolute path) is: /usr/bin/perl.
-
-PERL_PATH              = /usr/bin/perl
-
 #---------------------------------------------------------------------------
 # Configuration options related to the dot tool
 #---------------------------------------------------------------------------
@@ -2185,15 +2280,6 @@ PERL_PATH              = /usr/bin/perl
 
 CLASS_DIAGRAMS         = YES
 
-# You can define message sequence charts within doxygen comments using the \msc
-# command. Doxygen will then run the mscgen tool (see:
-# http://www.mcternan.me.uk/mscgen/)) to produce the chart and insert it in the
-# documentation. The MSCGEN_PATH tag allows you to specify the directory where
-# the mscgen tool resides. If left empty the tool is assumed to be found in the
-# default search path.
-
-MSCGEN_PATH            =
-
 # You can include diagrams made with dia in doxygen documentation. Doxygen will
 # then run dia to produce the diagram and insert it in the documentation. The
 # DIA_PATH tag allows you to specify the directory where the dia binary resides.

From d5c56830dcf4ad8972deac4751f28808a565651b Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Fri, 25 Nov 2022 12:58:45 +0100
Subject: [PATCH 095/183] Pin flake8 to <6.0.0 for the time being

---
 tox.ini | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tox.ini b/tox.ini
index fcaa0c30..53283fb3 100644
--- a/tox.ini
+++ b/tox.ini
@@ -5,7 +5,7 @@ skip_missing_interpreters = true
 [testenv]
 deps =
     mypy
-    flake8
+    flake8<6.0.0
     pytest
     pytest-cov
     git+https://github.com/multiscale/ymmsl-python.git@feature/checkpointing#egg=ymmsl

From d6f530fee585499915dea26e16f88719daf304bf Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Fri, 25 Nov 2022 15:40:30 +0100
Subject: [PATCH 096/183] Update consistency check for restart from 0

---
 integration_test/test_snapshot_macro_micro.py | 93 +++++++++++++------
 .../libmuscle/manager/snapshot_registry.py    | 36 ++++---
 .../manager/test/test_snapshot_registry.py    | 37 +++++---
 3 files changed, 114 insertions(+), 52 deletions(-)

diff --git a/integration_test/test_snapshot_macro_micro.py b/integration_test/test_snapshot_macro_micro.py
index f9c14103..ae7e8f27 100644
--- a/integration_test/test_snapshot_macro_micro.py
+++ b/integration_test/test_snapshot_macro_micro.py
@@ -145,6 +145,17 @@ def stateless_micro():
         instance.send('o_f', Message(t_cur, None, i))
 
 
+def data_transformer():
+    instance = Instance({
+            Operator.F_INIT: ['f_i'],
+            Operator.O_F: ['o_f']},
+            stateful=ImplementationState.STATELESS)
+
+    while instance.reuse_instance():
+        msg = instance.receive('f_i')
+        instance.send('o_f', msg)
+
+
 @pytest.fixture
 def base_config():
     return load(f"""ymmsl_version: v0.1
@@ -169,11 +180,27 @@ def base_config():
   - every: 0.4""")
 
 
+@pytest.fixture
+def config_with_transformer(base_config):
+    base_config.update(load("""ymmsl_version: v0.1
+model:
+  name: test_snapshot
+  components:
+    transformer1: transformer
+    transformer2: transformer
+  conduits:
+    macro.o_i: transformer1.f_i
+    transformer1.o_f: micro.f_i
+    micro.o_f: transformer2.f_i
+    transformer2.o_f: macro.s"""))
+    return base_config
+
+
 def test_snapshot_macro_micro(tmp_path, base_config):
+    actors = {'macro': macro, 'micro': micro}
     run_dir1 = RunDir(tmp_path / 'run1')
     run_manager_with_actors(
-            dump(base_config), run_dir1.path,
-            python_actors={'macro': macro, 'micro': micro})
+            dump(base_config), run_dir1.path, python_actors=actors)
 
     # Note: sorted only works because we have fewer than 10 snapshots, otherwise
     # _10 would be sorted right after _1
@@ -183,22 +210,20 @@ def test_snapshot_macro_micro(tmp_path, base_config):
     assert len(micro_snapshots) == 6  # 0, 0.4, 0.8, 1.2, 1.6, final
     snapshots_ymmsl = sorted(run_dir1.snapshot_dir().glob('snapshot_*.ymmsl'))
     snapshot_docs = list(map(load, snapshots_ymmsl))
-    assert 'macro' not in snapshot_docs[0].resume
+    assert len(snapshot_docs) == 7
+    assert snapshot_docs[0].resume['macro'] == macro_snapshots[0]
     assert snapshot_docs[0].resume['micro'] == micro_snapshots[0]
     assert snapshot_docs[1].resume['macro'] == macro_snapshots[0]
-    assert snapshot_docs[1].resume['micro'] == micro_snapshots[0]
-    assert snapshot_docs[2].resume['macro'] == macro_snapshots[0]
-    assert snapshot_docs[2].resume['micro'] == micro_snapshots[1]
-    for i in range(3, 8):
-        assert snapshot_docs[i].resume['macro'] == macro_snapshots[i - 2]
-        assert snapshot_docs[i].resume['micro'] == micro_snapshots[i - 2]
+    assert snapshot_docs[1].resume['micro'] == micro_snapshots[1]
+    for i in range(2, 7):
+        assert snapshot_docs[i].resume['macro'] == macro_snapshots[i - 1]
+        assert snapshot_docs[i].resume['micro'] == micro_snapshots[i - 1]
 
     # resume from the snapshots taken at t>=1.2
     run_dir2 = RunDir(tmp_path / 'run2')
-    base_config.update(snapshot_docs[5])  # add resume info
+    base_config.update(snapshot_docs[4])  # add resume info
     run_manager_with_actors(
-            dump(base_config), run_dir2.path,
-            python_actors={'macro': macro, 'micro': micro})
+            dump(base_config), run_dir2.path, python_actors=actors)
 
     macro_snapshots = sorted(run_dir2.snapshot_dir().glob('macro*'))
     assert len(macro_snapshots) == 2  # 1.6, final
@@ -213,15 +238,14 @@ def test_snapshot_macro_micro(tmp_path, base_config):
     base_config.update(snapshot_docs[0])        # add resume info
     base_config.settings['macro.t_max'] = 0.6   # run shorter
     run_manager_with_actors(
-            dump(base_config), run_dir3.path,
-            python_actors={'macro': macro, 'micro': micro})
+            dump(base_config), run_dir3.path, python_actors=actors)
 
 
 def test_snapshot_macro_stateless_micro(tmp_path, base_config):
+    actors = {'macro': macro, 'micro': stateless_micro}
     run_dir1 = RunDir(tmp_path / 'run1')
     run_manager_with_actors(
-            dump(base_config), run_dir1.path,
-            python_actors={'macro': macro, 'micro': stateless_micro})
+            dump(base_config), run_dir1.path, python_actors=actors)
 
     # Note: sorted only works because we have fewer than 10 snapshots, otherwise
     # _10 would be sorted right after _1
@@ -237,8 +261,7 @@ def test_snapshot_macro_stateless_micro(tmp_path, base_config):
     run_dir2 = RunDir(tmp_path / 'run2')
     base_config.update(snapshot_docs[3])  # add resume info
     run_manager_with_actors(
-            dump(base_config), run_dir2.path,
-            python_actors={'macro': macro, 'micro': stateless_micro})
+            dump(base_config), run_dir2.path, python_actors=actors)
 
     macro_snapshots = sorted(run_dir2.snapshot_dir().glob('macro*'))
     assert len(macro_snapshots) == 2  # 1.6, final
@@ -250,28 +273,23 @@ def test_snapshot_macro_stateless_micro(tmp_path, base_config):
 
 def test_snapshot_macro_vector_micro(tmp_path, base_config):
     base_config.model.components[1].multiplicity = [2]
+    actors = {'macro': macro_vector, 'micro[0]': micro, 'micro[1]': micro}
 
     run_dir1 = RunDir(tmp_path / 'run1')
     run_manager_with_actors(
-            dump(base_config), run_dir1.path,
-            python_actors={'macro': macro_vector,
-                           'micro[0]': micro,
-                           'micro[1]': micro})
+            dump(base_config), run_dir1.path, python_actors=actors)
 
     macro_snapshots = sorted(run_dir1.snapshot_dir().glob('macro*'))
     assert len(macro_snapshots) == 6  # 0, 0.4, 0.8, 1.2, 1.6, final
     micro_snapshots = sorted(run_dir1.snapshot_dir().glob('micro*'))
     assert len(micro_snapshots) == 6 * 2  # 0, 0.4, 0.8, 1.2, 1.6, final
     snapshots_ymmsl = sorted(run_dir1.snapshot_dir().glob('snapshot_*.ymmsl'))
-    assert len(snapshots_ymmsl) == 10
+    assert len(snapshots_ymmsl) == 8
 
     run_dir2 = RunDir(tmp_path / 'run2')
     base_config.update(load(snapshots_ymmsl[-3]))  # add resume info
     run_manager_with_actors(
-            dump(base_config), run_dir2.path,
-            python_actors={'macro': macro_vector,
-                           'micro[0]': micro,
-                           'micro[1]': micro})
+            dump(base_config), run_dir2.path, python_actors=actors)
 
     macro_snapshots = sorted(run_dir2.snapshot_dir().glob('macro*'))
     assert len(macro_snapshots) == 2  # 1.6, final
@@ -279,3 +297,24 @@ def test_snapshot_macro_vector_micro(tmp_path, base_config):
     assert len(micro_snapshots) == 2 * 2  # 1.6, final
     snapshots_ymmsl = sorted(run_dir2.snapshot_dir().glob('snapshot_*.ymmsl'))
     assert len(snapshots_ymmsl) == 2
+
+
+def test_snapshot_macro_transformer_micro(tmp_path, config_with_transformer):
+    actors = {'macro': macro, 'micro': micro, 'transformer1': data_transformer,
+              'transformer2': data_transformer}
+
+    run_dir1 = RunDir(tmp_path / 'run1')
+    run_manager_with_actors(
+            dump(config_with_transformer), run_dir1.path, python_actors=actors)
+
+    snapshots_ymmsl = sorted(run_dir1.snapshot_dir().glob('snapshot_*.ymmsl'))
+    assert len(snapshots_ymmsl) == 8
+
+    # pick one to resume from
+    run_dir2 = RunDir(tmp_path / 'run2')
+    config_with_transformer.update(load(snapshots_ymmsl[4]))  # add resume info
+    run_manager_with_actors(
+            dump(config_with_transformer), run_dir2.path, python_actors=actors)
+
+    snapshots_ymmsl = sorted(run_dir2.snapshot_dir().glob('snapshot_*.ymmsl'))
+    assert len(snapshots_ymmsl) == 3
diff --git a/libmuscle/python/libmuscle/manager/snapshot_registry.py b/libmuscle/python/libmuscle/manager/snapshot_registry.py
index cbb8bbde..dcf7c3e8 100644
--- a/libmuscle/python/libmuscle/manager/snapshot_registry.py
+++ b/libmuscle/python/libmuscle/manager/snapshot_registry.py
@@ -24,6 +24,9 @@
 _QueueItemType = Optional[Tuple[Reference, SnapshotMetadata]]
 _T = TypeVar("_T")
 
+# this snapshot is used as a placeholder for restarting from scratch
+_NULL_SNAPSHOT = SnapshotMetadata(["Instance start"], 0, 0, None, {}, True, '')
+
 
 def safe_get(lst: List[_T], index: int, default: _T) -> _T:
     """Get an item from the list, returning default when it does not exist.
@@ -45,40 +48,48 @@ class _ConnectionInfo(Flag):
     PEER_IS_VECTOR = auto()
 
 
-def calc_consistency(num1: int, num2: int, first_is_sent: bool) -> bool:
+def calc_consistency(
+        num1: int, num2: int, first_is_sent: bool, num2_is_restart: bool
+        ) -> bool:
     """Calculate consistency of message counts.
 
     Args:
         num1: message count of instance 1
         num2: message count of instance 2
         first_is_sent: True iff instance 1 is sending messages over this conduit
+        num2_is_restart: True iff the snapshot of num2 is a full restart
 
     Returns:
         True iff the two message counts are consistent
     """
     return (num1 == num2 or                             # strong
             num1 + 1 == num2 and first_is_sent or       # weak (1 = sent)
-            num2 + 1 == num1 and not first_is_sent)     # weak (2 = sent)
+            # weak (2 = sent) - only allow if num2 is not a restart
+            num2 + 1 == num1 and not first_is_sent and not num2_is_restart)
 
 
 def calc_consistency_list(
-        num1: List[int], num2: List[int], first_is_sent: bool) -> bool:
+        num1: List[int], num2: List[int], first_is_sent: bool,
+        num2_is_restart: bool) -> bool:
     """Calculate consistency of message counts.
 
     Args:
         num1: message count of instance 1
         num2: message count of instance 2
         first_is_sent: True iff instance 1 is sending messages over this conduit
+        num2_is_restart: True iff the snapshot of num2 is a full restart
 
     Returns:
         True iff the two message counts are consistent
     """
     if first_is_sent:
+        allow_weak = True
         slot_iter = zip_longest(num1, num2, fillvalue=0)
     else:
+        allow_weak = not num2_is_restart
         slot_iter = zip_longest(num2, num1, fillvalue=0)
-    return all(slot_sent == slot_received or        # strong
-               slot_sent + 1 == slot_received       # weak
+    return all(slot_sent == slot_received or                    # strong
+               slot_sent + 1 == slot_received and allow_weak    # weak
                for slot_sent, slot_received in slot_iter)
 
 
@@ -129,6 +140,7 @@ def do_consistency_check(
         """
         i_snapshot = self.snapshot
         p_snapshot = peer_node.snapshot
+        peer_is_restart = p_snapshot is _NULL_SNAPSHOT
         for connection in connections:
             i_port, p_port, conn = connection
             is_sending = bool(conn & _ConnectionInfo.SELF_IS_SENDING)
@@ -139,16 +151,16 @@ def do_consistency_check(
                 consistent = calc_consistency(
                         safe_get(i_msg_counts, slot, 0),
                         safe_get(p_msg_counts, 0, 0),
-                        is_sending)
+                        is_sending, peer_is_restart)
             elif conn & _ConnectionInfo.PEER_IS_VECTOR:
                 slot = int(self.instance[-1])
                 consistent = calc_consistency(
                         safe_get(i_msg_counts, 0, 0),
                         safe_get(p_msg_counts, slot, 0),
-                        is_sending)
+                        is_sending, peer_is_restart)
             else:
                 consistent = calc_consistency_list(
-                        i_msg_counts, p_msg_counts, is_sending)
+                        i_msg_counts, p_msg_counts, is_sending, peer_is_restart)
             if not consistent:  # not consistent
                 return False
         self.consistent_peers.setdefault(
@@ -196,10 +208,8 @@ def __init__(
             self._instances.update(component.instances())
 
         # Create snapshot nodes for starting from scratch
-        self._null_snapshot = SnapshotMetadata(
-                ["Instance start"], 0, 0, None, {}, True, '')
         for instance in self._instances:
-            self.register_snapshot(instance, self._null_snapshot)
+            self.register_snapshot(instance, _NULL_SNAPSHOT)
 
     def register_snapshot(
             self, instance: Reference, snapshot: SnapshotMetadata) -> None:
@@ -248,7 +258,7 @@ def _add_snapshot(
                         peer_snapshot, self._get_connections(instance, peer))
 
         # finally, check if this snapshotnode is now part of a workflow snapshot
-        if snapshot is not self._null_snapshot:
+        if snapshot is not _NULL_SNAPSHOT:
             self._save_workflow_snapshot(snapshotnode)
 
     def _save_workflow_snapshot(self, snapshotnode: SnapshotNode) -> None:
@@ -408,7 +418,7 @@ def _generate_snapshot_config(
         selected_snapshots.sort(key=attrgetter('instance'))
         resume = {}
         for node in selected_snapshots:
-            if node.snapshot is not self._null_snapshot:
+            if node.snapshot is not _NULL_SNAPSHOT:
                 # Only store resume information when it is an actual snapshot
                 # created by the instance. Otherwise the instance can just be
                 # restarted from the beginning.
diff --git a/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py b/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py
index dd6c0c46..6b9838e6 100644
--- a/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py
+++ b/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py
@@ -8,8 +8,8 @@
         ImplementationState as IState, Reference)
 
 from libmuscle.manager.snapshot_registry import (
-    SnapshotNode, SnapshotRegistry, calc_consistency, calc_consistency_list, safe_get,
-    _ConnectionInfo)
+    SnapshotNode, SnapshotRegistry, calc_consistency, calc_consistency_list,
+    safe_get, _ConnectionInfo)
 from libmuscle.manager.topology_store import TopologyStore
 from libmuscle.snapshot import SnapshotMetadata
 
@@ -78,26 +78,39 @@ def test_safe_get() -> None:
 def test_calc_consistency() -> None:
     num_sent = 3
     for num_received in [2, 3, 4, 5]:
-        consistent = num_received in [3, 4]
-        assert calc_consistency(num_sent, num_received, True) is consistent
-        assert calc_consistency(num_received, num_sent, False) is consistent
+        expect = num_received in [3, 4]
+        assert calc_consistency(num_sent, num_received, True, False) is expect
+        assert calc_consistency(num_received, num_sent, False, False) is expect
 
     num_received = 10
     for num_sent in [8, 9, 10, 11]:
-        consistent = num_sent in [9, 10]
-        assert calc_consistency(num_sent, num_received, True) is consistent
-        assert calc_consistency(num_received, num_sent, False) is consistent
+        expect = num_sent in [9, 10]
+        assert calc_consistency(num_sent, num_received, True, False) is expect
+        assert calc_consistency(num_received, num_sent, False, False) is expect
+
+
+def test_calc_consistency_with_restart() -> None:
+    # Check normal rules
+    assert calc_consistency(0, 0, True, True)
+    assert calc_consistency(0, 0, False, True)
+    assert not calc_consistency(1, 0, True, True)
+    assert not calc_consistency(1, 0, True, False)
+    assert calc_consistency(1, 0, False, False)
+    # Different: num2 == 0 comes from the restarted actor, we do not want a
+    # resume file to be created in this instance (because an instance further in
+    # the call chain is ahead of the one that would be restarted):
+    assert not calc_consistency(1, 0, False, True)
 
 
 def test_calc_consistency_list() -> None:
     num_sent = [3, 3]
     for num_received in [[2, 3], [3, 2], [3, 5], [], [4, 4, 0, 0, 2]]:
-        assert not calc_consistency_list(num_sent, num_received, True)
-        assert not calc_consistency_list(num_received, num_sent, False)
+        assert not calc_consistency_list(num_sent, num_received, True, False)
+        assert not calc_consistency_list(num_received, num_sent, False, False)
     for num_received in [[3, 3], [3, 4], [4, 3], [4, 4],
                          [3, 3, 1], [4, 4, 0, 0, 0, 1, 0, 1]]:
-        assert calc_consistency_list(num_sent, num_received, True)
-        assert calc_consistency_list(num_received, num_sent, False)
+        assert calc_consistency_list(num_sent, num_received, True, False)
+        assert calc_consistency_list(num_received, num_sent, False, False)
 
 
 def test_write_ymmsl(tmp_path: Path):

From 4d2cfc2ba12fd0a4a26c4ca66766c7b0cfcaca5a Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Mon, 28 Nov 2022 10:53:28 +0100
Subject: [PATCH 097/183] Update snapshot_directory:

- When a run_dir is provided, each instance has a unique snapshot folder
- Snapshot_directory is fixed for a whole run and provided to instances
  in get_checkpoint_info() instead of the settings
- Fallback snapshot directory is the cwd() of the instance (warning generated in muscle manager)
---
 integration_test/test_snapshot_macro_micro.py | 48 ++++++++++---------
 libmuscle/python/libmuscle/instance.py        | 11 +----
 libmuscle/python/libmuscle/manager/manager.py |  7 +--
 .../python/libmuscle/manager/mmp_server.py    | 23 +++++++--
 libmuscle/python/libmuscle/manager/run_dir.py |  2 +-
 .../python/libmuscle/manager/test/conftest.py | 10 ++--
 .../manager/test/test_mmp_request_handler.py  | 19 +++++++-
 libmuscle/python/libmuscle/mmp_client.py      | 17 +++++--
 .../python/libmuscle/snapshot_manager.py      | 13 ++---
 .../python/libmuscle/test/test_instance.py    | 15 +++---
 .../libmuscle/test/test_snapshot_manager.py   | 26 +++++-----
 11 files changed, 108 insertions(+), 83 deletions(-)

diff --git a/integration_test/test_snapshot_macro_micro.py b/integration_test/test_snapshot_macro_micro.py
index ae7e8f27..0420df98 100644
--- a/integration_test/test_snapshot_macro_micro.py
+++ b/integration_test/test_snapshot_macro_micro.py
@@ -204,11 +204,11 @@ def test_snapshot_macro_micro(tmp_path, base_config):
 
     # Note: sorted only works because we have fewer than 10 snapshots, otherwise
     # _10 would be sorted right after _1
-    macro_snapshots = sorted(run_dir1.snapshot_dir().glob('macro*'))
+    macro_snapshots = sorted(run_dir1.snapshot_dir('macro').iterdir())
     assert len(macro_snapshots) == 6  # 0, 0.4, 0.8, 1.2, 1.6, final
-    micro_snapshots = sorted(run_dir1.snapshot_dir().glob('micro*'))
+    micro_snapshots = sorted(run_dir1.snapshot_dir('micro').iterdir())
     assert len(micro_snapshots) == 6  # 0, 0.4, 0.8, 1.2, 1.6, final
-    snapshots_ymmsl = sorted(run_dir1.snapshot_dir().glob('snapshot_*.ymmsl'))
+    snapshots_ymmsl = sorted(run_dir1.snapshot_dir().iterdir())
     snapshot_docs = list(map(load, snapshots_ymmsl))
     assert len(snapshot_docs) == 7
     assert snapshot_docs[0].resume['macro'] == macro_snapshots[0]
@@ -225,11 +225,11 @@ def test_snapshot_macro_micro(tmp_path, base_config):
     run_manager_with_actors(
             dump(base_config), run_dir2.path, python_actors=actors)
 
-    macro_snapshots = sorted(run_dir2.snapshot_dir().glob('macro*'))
+    macro_snapshots = sorted(run_dir2.snapshot_dir('macro').iterdir())
     assert len(macro_snapshots) == 2  # 1.6, final
-    micro_snapshots = sorted(run_dir2.snapshot_dir().glob('micro*'))
+    micro_snapshots = sorted(run_dir2.snapshot_dir('micro').iterdir())
     assert len(micro_snapshots) == 2  # 1.6, final
-    snapshots_ymmsl = sorted(run_dir2.snapshot_dir().glob('snapshot_*.ymmsl'))
+    snapshots_ymmsl = sorted(run_dir2.snapshot_dir().iterdir())
     assert len(snapshots_ymmsl) == 2
 
     # resume from the first workflow snapshot (this restarts macro from scratch)
@@ -249,11 +249,11 @@ def test_snapshot_macro_stateless_micro(tmp_path, base_config):
 
     # Note: sorted only works because we have fewer than 10 snapshots, otherwise
     # _10 would be sorted right after _1
-    macro_snapshots = sorted(run_dir1.snapshot_dir().glob('macro*'))
+    macro_snapshots = sorted(run_dir1.snapshot_dir('macro').iterdir())
     assert len(macro_snapshots) == 6  # 0, 0.4, 0.8, 1.2, 1.6, final
-    micro_snapshots = sorted(run_dir1.snapshot_dir().glob('micro*'))
+    micro_snapshots = sorted(run_dir1.snapshot_dir('micro').iterdir())
     assert len(micro_snapshots) == 6  # 0, 0.4, 0.8, 1.2, 1.6, final
-    snapshots_ymmsl = sorted(run_dir1.snapshot_dir().glob('snapshot_*.ymmsl'))
+    snapshots_ymmsl = sorted(run_dir1.snapshot_dir().iterdir())
     snapshot_docs = list(map(load, snapshots_ymmsl))
     assert len(snapshot_docs) == 6
 
@@ -263,11 +263,11 @@ def test_snapshot_macro_stateless_micro(tmp_path, base_config):
     run_manager_with_actors(
             dump(base_config), run_dir2.path, python_actors=actors)
 
-    macro_snapshots = sorted(run_dir2.snapshot_dir().glob('macro*'))
+    macro_snapshots = sorted(run_dir2.snapshot_dir('macro').iterdir())
     assert len(macro_snapshots) == 2  # 1.6, final
-    micro_snapshots = sorted(run_dir2.snapshot_dir().glob('micro*'))
+    micro_snapshots = sorted(run_dir2.snapshot_dir('micro').iterdir())
     assert len(micro_snapshots) == 3  # 1.2, 1.6, final
-    snapshots_ymmsl = sorted(run_dir2.snapshot_dir().glob('snapshot_*.ymmsl'))
+    snapshots_ymmsl = sorted(run_dir2.snapshot_dir().iterdir())
     assert len(snapshots_ymmsl) == 2
 
 
@@ -279,11 +279,13 @@ def test_snapshot_macro_vector_micro(tmp_path, base_config):
     run_manager_with_actors(
             dump(base_config), run_dir1.path, python_actors=actors)
 
-    macro_snapshots = sorted(run_dir1.snapshot_dir().glob('macro*'))
+    macro_snapshots = sorted(run_dir1.snapshot_dir('macro').iterdir())
     assert len(macro_snapshots) == 6  # 0, 0.4, 0.8, 1.2, 1.6, final
-    micro_snapshots = sorted(run_dir1.snapshot_dir().glob('micro*'))
-    assert len(micro_snapshots) == 6 * 2  # 0, 0.4, 0.8, 1.2, 1.6, final
-    snapshots_ymmsl = sorted(run_dir1.snapshot_dir().glob('snapshot_*.ymmsl'))
+    micro_snapshots = sorted(run_dir1.snapshot_dir('micro[0]').iterdir())
+    assert len(micro_snapshots) == 6  # 0, 0.4, 0.8, 1.2, 1.6, final
+    micro_snapshots = sorted(run_dir1.snapshot_dir('micro[1]').iterdir())
+    assert len(micro_snapshots) == 6  # 0, 0.4, 0.8, 1.2, 1.6, final
+    snapshots_ymmsl = sorted(run_dir1.snapshot_dir().iterdir())
     assert len(snapshots_ymmsl) == 8
 
     run_dir2 = RunDir(tmp_path / 'run2')
@@ -291,11 +293,13 @@ def test_snapshot_macro_vector_micro(tmp_path, base_config):
     run_manager_with_actors(
             dump(base_config), run_dir2.path, python_actors=actors)
 
-    macro_snapshots = sorted(run_dir2.snapshot_dir().glob('macro*'))
+    macro_snapshots = sorted(run_dir2.snapshot_dir('macro').iterdir())
     assert len(macro_snapshots) == 2  # 1.6, final
-    micro_snapshots = sorted(run_dir2.snapshot_dir().glob('micro*'))
-    assert len(micro_snapshots) == 2 * 2  # 1.6, final
-    snapshots_ymmsl = sorted(run_dir2.snapshot_dir().glob('snapshot_*.ymmsl'))
+    micro_snapshots = sorted(run_dir2.snapshot_dir('micro[0]').iterdir())
+    assert len(micro_snapshots) == 2  # 1.6, final
+    micro_snapshots = sorted(run_dir2.snapshot_dir('micro[1]').iterdir())
+    assert len(micro_snapshots) == 2  # 1.6, final
+    snapshots_ymmsl = sorted(run_dir2.snapshot_dir().iterdir())
     assert len(snapshots_ymmsl) == 2
 
 
@@ -307,7 +311,7 @@ def test_snapshot_macro_transformer_micro(tmp_path, config_with_transformer):
     run_manager_with_actors(
             dump(config_with_transformer), run_dir1.path, python_actors=actors)
 
-    snapshots_ymmsl = sorted(run_dir1.snapshot_dir().glob('snapshot_*.ymmsl'))
+    snapshots_ymmsl = sorted(run_dir1.snapshot_dir().iterdir())
     assert len(snapshots_ymmsl) == 8
 
     # pick one to resume from
@@ -316,5 +320,5 @@ def test_snapshot_macro_transformer_micro(tmp_path, config_with_transformer):
     run_manager_with_actors(
             dump(config_with_transformer), run_dir2.path, python_actors=actors)
 
-    snapshots_ymmsl = sorted(run_dir2.snapshot_dir().glob('snapshot_*.ymmsl'))
+    snapshots_ymmsl = sorted(run_dir2.snapshot_dir().iterdir())
     assert len(snapshots_ymmsl) == 3
diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py
index 03f3d494..48bb2905 100644
--- a/libmuscle/python/libmuscle/instance.py
+++ b/libmuscle/python/libmuscle/instance.py
@@ -1,7 +1,6 @@
 from copy import copy
 import logging
 import os
-from pathlib import Path
 import sys
 from typing import cast, Dict, List, Optional, Tuple, overload
 # TODO: import from typing module when dropping support for python 3.7
@@ -150,16 +149,8 @@ def reuse_instance(self, apply_overlay: bool = True) -> bool:
             do_reuse = self.__check_reuse_instance(apply_overlay)
         self._do_reuse = None
 
-        # Note: muscle_snapshot_directory setting is provided by muscle_manager
-        # when checkpointing is enabled for this run. When checkpointing is not
-        # enabled, it might not exist and a KeyError is raised.
-        try:
-            snapshot_dir = self.get_setting('muscle_snapshot_directory', 'str')
-            snapshot_path = Path(snapshot_dir)
-        except KeyError:
-            snapshot_path = None
         self._snapshot_manager.reuse_instance(
-                snapshot_path, do_reuse, self.__f_init_max_timestamp)
+                do_reuse, self.__f_init_max_timestamp)
 
         if not do_reuse:
             self.__close_ports()
diff --git a/libmuscle/python/libmuscle/manager/manager.py b/libmuscle/python/libmuscle/manager/manager.py
index d96842a7..0ec1da3b 100644
--- a/libmuscle/python/libmuscle/manager/manager.py
+++ b/libmuscle/python/libmuscle/manager/manager.py
@@ -57,11 +57,6 @@ def __init__(
                     self._configuration,
                     self._run_dir.path / 'configuration.ymmsl')
 
-        # TODO: decide if this should be a setting or part of checkpoint_info
-        # TODO: separate folder per intance
-        self._configuration.settings.setdefault(
-                'muscle_snapshot_directory', str(snapshot_dir))
-
         self._instance_manager = None    # type: Optional[InstanceManager]
         try:
             configuration = self._configuration.as_configuration()
@@ -80,7 +75,7 @@ def __init__(
         self._server = MMPServer(
                 self._logger, self._configuration,
                 self._instance_registry, self._topology_store,
-                self._snapshot_registry)
+                self._snapshot_registry, run_dir)
 
         if self._instance_manager:
             self._instance_manager.set_manager_location(
diff --git a/libmuscle/python/libmuscle/manager/mmp_server.py b/libmuscle/python/libmuscle/manager/mmp_server.py
index 9382d0eb..90617fae 100644
--- a/libmuscle/python/libmuscle/manager/mmp_server.py
+++ b/libmuscle/python/libmuscle/manager/mmp_server.py
@@ -1,7 +1,7 @@
 from datetime import datetime, timezone
 import errno
 import logging
-from typing import Any, Dict, cast, List
+from typing import Any, Dict, cast, List, Optional
 
 import msgpack
 from ymmsl import (
@@ -12,6 +12,7 @@
 from libmuscle.manager.instance_registry import (
         AlreadyRegistered, InstanceRegistry)
 from libmuscle.manager.logger import Logger
+from libmuscle.manager.run_dir import RunDir
 from libmuscle.manager.snapshot_registry import SnapshotRegistry
 from libmuscle.manager.topology_store import TopologyStore
 from libmuscle.mcp.protocol import RequestType, ResponseType
@@ -56,7 +57,9 @@ def __init__(
             configuration: PartialConfiguration,
             instance_registry: InstanceRegistry,
             topology_store: TopologyStore,
-            snapshot_registry: SnapshotRegistry):
+            snapshot_registry: SnapshotRegistry,
+            run_dir: Optional[RunDir]
+            ) -> None:
         """Create an MMPRequestHandler.
 
         Args:
@@ -70,6 +73,7 @@ def __init__(
         self._instance_registry = instance_registry
         self._topology_store = topology_store
         self._snapshot_registry = snapshot_registry
+        self._run_dir = run_dir
         self._reference_time = datetime.now(timezone.utc)
         self._reference_timestamp = self._reference_time.timestamp()
 
@@ -286,15 +290,23 @@ def _get_checkpoint_info(self, instance_id: str) -> Any:
                 wallclock time of the start of the workflow.
             checkpoints (dict): Dictionary encdoing a ymmsl.Checkpoints object.
             resume_path (Optional[str]): Checkpoint filename to resume from.
+            snapshot_directory (Optional[str]): Directory to store instance
+                snapshots.
         """
         instance = Reference(instance_id)
         resume = None
         if instance in self._configuration.resume:
             resume = str(self._configuration.resume[instance])
+
+        snapshot_directory = None
+        if self._run_dir is not None:
+            snapshot_directory = str(self._run_dir.snapshot_dir(instance))
+
         return [ResponseType.SUCCESS.value,
                 self._reference_timestamp,
                 encode_checkpoints(self._configuration.checkpoints),
-                resume]
+                resume,
+                snapshot_directory]
 
 
 class MMPServer:
@@ -310,7 +322,8 @@ def __init__(
             configuration: PartialConfiguration,
             instance_registry: InstanceRegistry,
             topology_store: TopologyStore,
-            snapshot_registry: SnapshotRegistry
+            snapshot_registry: SnapshotRegistry,
+            run_dir: Optional[RunDir]
             ) -> None:
         """Create an MMPServer.
 
@@ -329,7 +342,7 @@ def __init__(
         """
         self._handler = MMPRequestHandler(
                 logger, configuration, instance_registry, topology_store,
-                snapshot_registry)
+                snapshot_registry, run_dir)
         try:
             self._server = TcpTransportServer(self._handler, 9000)
         except OSError as e:
diff --git a/libmuscle/python/libmuscle/manager/run_dir.py b/libmuscle/python/libmuscle/manager/run_dir.py
index 6a50c2fe..186d32e8 100644
--- a/libmuscle/python/libmuscle/manager/run_dir.py
+++ b/libmuscle/python/libmuscle/manager/run_dir.py
@@ -75,5 +75,5 @@ def snapshot_dir(self, name: Optional[Reference] = None) -> Path:
             path = self.path / 'snapshots'
         else:
             path = self.instance_dir(name) / 'snapshots'
-        path.mkdir(exist_ok=True)
+        path.mkdir(parents=True, exist_ok=True)
         return path
diff --git a/libmuscle/python/libmuscle/manager/test/conftest.py b/libmuscle/python/libmuscle/manager/test/conftest.py
index 24772bda..992a3950 100644
--- a/libmuscle/python/libmuscle/manager/test/conftest.py
+++ b/libmuscle/python/libmuscle/manager/test/conftest.py
@@ -6,6 +6,7 @@
 from libmuscle.manager.instance_registry import InstanceRegistry
 from libmuscle.manager.logger import Logger
 from libmuscle.manager.mmp_server import MMPRequestHandler
+from libmuscle.manager.run_dir import RunDir
 from libmuscle.manager.snapshot_registry import SnapshotRegistry
 from libmuscle.manager.topology_store import TopologyStore
 
@@ -53,7 +54,7 @@ def mmp_request_handler(
         snapshot_registry):
     return MMPRequestHandler(
             logger, mmp_configuration, instance_registry, topology_store,
-            snapshot_registry)
+            snapshot_registry, None)
 
 
 @pytest.fixture
@@ -73,7 +74,7 @@ def registered_mmp_request_handler(
         snapshot_registry):
     return MMPRequestHandler(
             logger, mmp_configuration, loaded_instance_registry, topology_store,
-            snapshot_registry)
+            snapshot_registry, None)
 
 
 @pytest.fixture
@@ -126,7 +127,8 @@ def loaded_instance_registry2():
 @pytest.fixture
 def registered_mmp_request_handler2(
         logger, mmp_configuration, loaded_instance_registry2, topology_store2,
-        snapshot_registry2):
+        snapshot_registry2, tmp_path):
     return MMPRequestHandler(
             logger, mmp_configuration,
-            loaded_instance_registry2, topology_store2, snapshot_registry2)
+            loaded_instance_registry2, topology_store2, snapshot_registry2,
+            RunDir(tmp_path))
diff --git a/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py b/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py
index 89de4068..876ae197 100644
--- a/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py
+++ b/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py
@@ -17,7 +17,7 @@ def test_create_servicer(logger, mmp_configuration, instance_registry,
                          topology_store, snapshot_registry):
     MMPRequestHandler(
             logger, mmp_configuration, instance_registry, topology_store,
-            snapshot_registry)
+            snapshot_registry, None)
 
 
 def test_log_message(mmp_request_handler, caplog):
@@ -109,7 +109,7 @@ def test_get_checkpoint_info(mmp_configuration, mmp_request_handler):
     decoded_result = msgpack.unpackb(result, raw=False)
 
     assert decoded_result[0] == ResponseType.SUCCESS.value
-    timestamp, checkpoints, resume = decoded_result[1:]
+    timestamp, checkpoints, resume, snapshot_directory = decoded_result[1:]
 
     ref_time = datetime.fromtimestamp(timestamp, tz=timezone.utc)
     assert ref_time == mmp_request_handler._reference_time
@@ -126,6 +126,21 @@ def test_get_checkpoint_info(mmp_configuration, mmp_request_handler):
     assert resume is not None
     assert Path(resume) == resume_path
 
+    assert snapshot_directory is None
+
+
+def test_get_checkpoint_info2(registered_mmp_request_handler2, tmp_path):
+    request = [RequestType.GET_CHECKPOINT_INFO.value, 'test_instance']
+    encoded_request = msgpack.packb(request, use_bin_type=True)
+
+    result = registered_mmp_request_handler2.handle_request(encoded_request)
+    decoded_result = msgpack.unpackb(result, raw=False)
+
+    assert decoded_result[0] == ResponseType.SUCCESS.value
+    snapshot_directory = decoded_result[4]
+    assert snapshot_directory == (
+            str(tmp_path) + '/instances/test_instance/snapshots')
+
 
 def test_double_register_instance(mmp_request_handler):
     request = [
diff --git a/libmuscle/python/libmuscle/mmp_client.py b/libmuscle/python/libmuscle/mmp_client.py
index 37effdca..f40ea48b 100644
--- a/libmuscle/python/libmuscle/mmp_client.py
+++ b/libmuscle/python/libmuscle/mmp_client.py
@@ -22,6 +22,9 @@
 PEER_INTERVAL_MIN = 5.0
 PEER_INTERVAL_MAX = 10.0
 
+_CheckpointInfoType = Tuple[
+        datetime, Checkpoints, Optional[Path], Optional[Path]]
+
 
 def encode_operator(op: Operator) -> str:
     """Convert an Operator to a MsgPack-compatible value."""
@@ -63,8 +66,9 @@ def decode_checkpoint_rule(rule: Dict[str, Any]) -> CheckpointRule:
 def decode_checkpoint_info(
         reference_timestamp: float,
         checkpoints_dict: Dict[str, Any],
-        resume: Optional[str]
-        ) -> Tuple[datetime, Checkpoints, Optional[Path]]:
+        resume: Optional[str],
+        snapshot_dir: Optional[str]
+        ) -> _CheckpointInfoType:
     """Decode checkpoint info from a MsgPack-compatible value.
 
     Args:
@@ -72,11 +76,13 @@ def decode_checkpoint_info(
             wallclock_time = 0
         checkpoints_dict: dictionary of checkpoint definitions
         resume: optional string indicating resume path
+        snapshot_dir: optional string indicating path to store snapshots in
 
     Returns:
         wallclock_time_reference: UTC time where wallclock_time = 0
         checkpoints: checkpoint configuration
         resume: path to the resume snapshot
+        snapshot_dir: optional path to store snapshots in
     """
     ref_time = datetime.fromtimestamp(reference_timestamp, tz=timezone.utc)
     checkpoints = Checkpoints(
@@ -86,7 +92,8 @@ def decode_checkpoint_info(
             simulation_time=[decode_checkpoint_rule(rule)
                              for rule in checkpoints_dict["simulation_time"]])
     resume_path = None if resume is None else Path(resume)
-    return (ref_time, checkpoints, resume_path)
+    snapshot_path = None if snapshot_dir is None else Path(snapshot_dir)
+    return (ref_time, checkpoints, resume_path, snapshot_path)
 
 
 class MMPClient():
@@ -162,14 +169,14 @@ def get_settings(self) -> Settings:
         response = self._call_manager(request)
         return Settings(response[1])
 
-    def get_checkpoint_info(self, name: Reference
-                            ) -> Tuple[datetime, Checkpoints, Optional[Path]]:
+    def get_checkpoint_info(self, name: Reference) -> _CheckpointInfoType:
         """Get the checkpoint info from the manager.
 
         Returns:
             wallclock_time_reference: UTC time where wallclock_time = 0
             checkpoints: checkpoint configuration
             resume: path to the resume snapshot
+            snapshot_directory: path to store snapshots
         """
         request = [RequestType.GET_CHECKPOINT_INFO.value, str(name)]
         response = self._call_manager(request)
diff --git a/libmuscle/python/libmuscle/snapshot_manager.py b/libmuscle/python/libmuscle/snapshot_manager.py
index 54059375..0678c96c 100644
--- a/libmuscle/python/libmuscle/snapshot_manager.py
+++ b/libmuscle/python/libmuscle/snapshot_manager.py
@@ -50,7 +50,6 @@ def __init__(self,
         self._first_reuse = True
         self._trigger_manager = TriggerManager()
         self._resume_from_snapshot = None   # type: Optional[Snapshot]
-        self._snapshot_directory = None     # type: Optional[Path]
         self._next_snapshot_num = 1
 
     def get_checkpoint_info(self) -> None:
@@ -62,7 +61,8 @@ def get_checkpoint_info(self) -> None:
     def _set_checkpoint_info(self,
                              utc_reference: datetime,
                              checkpoints: Checkpoints,
-                             resume: Optional[Path]) -> None:
+                             resume: Optional[Path],
+                             snapshot_directory: Optional[Path]) -> None:
         """Apply checkpoint info received from the manager.
 
         Args:
@@ -71,6 +71,7 @@ def _set_checkpoint_info(self,
             resume: previous snapshot to resume from (or None if not resuming)
         """
         self._trigger_manager.set_checkpoint_info(utc_reference, checkpoints)
+        self._snapshot_directory = snapshot_directory or Path.cwd()
         if resume is not None:
             snapshot = self.load_snapshot_from_file(resume)
             if snapshot.message is not None:
@@ -82,7 +83,7 @@ def _set_checkpoint_info(self,
             self._communicator.restore_message_counts(
                 snapshot.port_message_counts)
 
-    def reuse_instance(self, snapshot_directory: Optional[Path],
+    def reuse_instance(self,
                        do_reuse: bool, f_init_max_timestamp: Optional[float]
                        ) -> None:
         """Callback on Instance.reuse_instance
@@ -106,8 +107,6 @@ def reuse_instance(self, snapshot_directory: Optional[Path],
 
         self._trigger_manager.reuse_instance()
 
-        self._snapshot_directory = snapshot_directory
-
         if self._first_reuse:
             self._first_reuse = False
         else:
@@ -246,10 +245,6 @@ def __store_snapshot(self, snapshot: Snapshot) -> Path:
             Path where the snapshot is stored
         """
         _logger.debug(f'Saving snapshot to {self._snapshot_directory}')
-        if self._snapshot_directory is None:
-            raise RuntimeError('Unknown snapshot directory. Did you try to'
-                               ' save a snapshot before entering the reuse'
-                               ' loop?')
         for _ in range(_MAX_FILE_EXISTS_CHECK):
             # Expectation is that muscle_snapshot_directory is empty initially
             # and we succeed in the first loop. Still wrapping in a for-loop
diff --git a/libmuscle/python/libmuscle/test/test_instance.py b/libmuscle/python/libmuscle/test/test_instance.py
index 54044a00..17e3e3e0 100644
--- a/libmuscle/python/libmuscle/test/test_instance.py
+++ b/libmuscle/python/libmuscle/test/test_instance.py
@@ -37,7 +37,7 @@ def sys_argv_instance() -> Generator[None, None, None]:
 
 
 @pytest.fixture
-def instance(sys_argv_instance):
+def instance(sys_argv_instance, tmp_path):
     with patch('libmuscle.instance.MMPClient') as mmp_client, \
          patch('libmuscle.instance.Communicator') as comm_type:
         communicator = MagicMock()
@@ -49,7 +49,8 @@ def instance(sys_argv_instance):
 
         mmp_client_object = MagicMock()
         mmp_client_object.request_peers.return_value = (None, None, None)
-        checkpoint_info = (datetime.now(timezone.utc), Checkpoints(), None)
+        checkpoint_info = (datetime.now(timezone.utc), Checkpoints(), None,
+                           tmp_path)
         mmp_client_object.get_checkpoint_info.return_value = checkpoint_info
         mmp_client.return_value = mmp_client_object
 
@@ -62,12 +63,13 @@ def instance(sys_argv_instance):
 
 
 @pytest.fixture
-def instance2(sys_argv_instance):
+def instance2(sys_argv_instance, tmp_path):
     with patch('libmuscle.instance.MMPClient') as mmp_client, \
          patch('libmuscle.instance.Communicator'):
         mmp_client_object = MagicMock()
         mmp_client_object.request_peers.return_value = (None, None, None)
-        checkpoint_info = (datetime.now(timezone.utc), Checkpoints(), None)
+        checkpoint_info = (datetime.now(timezone.utc), Checkpoints(), None,
+                           tmp_path)
         mmp_client_object.get_checkpoint_info.return_value = checkpoint_info
         mmp_client.return_value = mmp_client_object
         instance = Instance({
@@ -77,12 +79,13 @@ def instance2(sys_argv_instance):
 
 
 def test_create_instance(
-        sys_argv_instance, log_file_in_tmpdir, sys_argv_manager):
+        sys_argv_instance, log_file_in_tmpdir, sys_argv_manager, tmp_path):
     with patch('libmuscle.instance.MMPClient') as mmp_client, \
          patch('libmuscle.instance.Communicator') as comm_type:
         mmp_client_object = MagicMock()
         mmp_client_object.request_peers.return_value = (None, None, None)
-        checkpoint_info = (datetime.now(timezone.utc), Checkpoints(), None)
+        checkpoint_info = (datetime.now(timezone.utc), Checkpoints(), None,
+                           tmp_path)
         mmp_client_object.get_checkpoint_info.return_value = checkpoint_info
         mmp_client.return_value = mmp_client_object
         ports = {
diff --git a/libmuscle/python/libmuscle/test/test_snapshot_manager.py b/libmuscle/python/libmuscle/test/test_snapshot_manager.py
index 16f81ce3..6325cb0d 100644
--- a/libmuscle/python/libmuscle/test/test_snapshot_manager.py
+++ b/libmuscle/python/libmuscle/test/test_snapshot_manager.py
@@ -22,10 +22,10 @@ def test_no_checkpointing(caplog: pytest.LogCaptureFixture, tmp_path: Path
             ImplementationState.STATEFUL)
 
     snapshot_manager._set_checkpoint_info(
-            datetime.now(timezone.utc), Checkpoints(), None)
+            datetime.now(timezone.utc), Checkpoints(), None, tmp_path)
 
     assert not snapshot_manager.resuming()
-    snapshot_manager.reuse_instance(tmp_path, True, None)
+    snapshot_manager.reuse_instance(True, None)
     assert not snapshot_manager.resuming()
     assert not snapshot_manager.should_save_snapshot(1)
     assert not snapshot_manager.should_save_snapshot(5000)
@@ -49,10 +49,10 @@ def test_save_load_snapshot(tmp_path: Path) -> None:
 
     checkpoints = Checkpoints(simulation_time=[CheckpointRangeRule(every=1)])
     snapshot_manager._set_checkpoint_info(
-            datetime.now(timezone.utc), checkpoints, None)
+            datetime.now(timezone.utc), checkpoints, None, tmp_path)
 
     assert not snapshot_manager.resuming()
-    snapshot_manager.reuse_instance(tmp_path, True, None)
+    snapshot_manager.reuse_instance(True, None)
     with pytest.raises(RuntimeError):
         snapshot_manager.load_snapshot()
 
@@ -79,11 +79,11 @@ def test_save_load_snapshot(tmp_path: Path) -> None:
             instance_id, manager, communicator, ImplementationState.STATEFUL)
 
     snapshot_manager2._set_checkpoint_info(
-            datetime.now(timezone.utc), checkpoints, snapshot_path)
+            datetime.now(timezone.utc), checkpoints, snapshot_path, tmp_path)
     communicator.restore_message_counts.assert_called_with(port_message_counts)
 
     assert snapshot_manager2.resuming()
-    snapshot_manager2.reuse_instance(tmp_path, True, None)
+    snapshot_manager2.reuse_instance(True, None)
     assert snapshot_manager2.resuming()
     msg = snapshot_manager2.load_snapshot()
     assert msg.timestamp == 0.2
@@ -109,7 +109,7 @@ def test_save_load_snapshot(tmp_path: Path) -> None:
     assert snapshot_path.name == 'test-1_2.pack'
 
     assert snapshot_manager2.resuming()
-    snapshot_manager2.reuse_instance(tmp_path, True, None)
+    snapshot_manager2.reuse_instance(True, None)
     assert not snapshot_manager2.resuming()
 
 
@@ -125,11 +125,11 @@ def test_save_load_implicit_snapshot(tmp_path: Path) -> None:
 
     checkpoints = Checkpoints(simulation_time=[CheckpointRangeRule(every=1)])
     snapshot_manager._set_checkpoint_info(
-            datetime.now(timezone.utc), checkpoints, None)
+            datetime.now(timezone.utc), checkpoints, None, tmp_path)
 
     assert not snapshot_manager.resuming()
-    snapshot_manager.reuse_instance(tmp_path, True, None)
-    snapshot_manager.reuse_instance(tmp_path, True, 1.5)
+    snapshot_manager.reuse_instance(True, None)
+    snapshot_manager.reuse_instance(True, 1.5)
     manager.submit_snapshot_metadata.assert_called_once()
     instance, metadata = manager.submit_snapshot_metadata.call_args[0]
     assert instance == instance_id
@@ -141,11 +141,11 @@ def test_save_load_implicit_snapshot(tmp_path: Path) -> None:
             instance_id, manager, communicator, ImplementationState.STATELESS)
 
     snapshot_manager2._set_checkpoint_info(
-            datetime.now(timezone.utc), checkpoints, snapshot_path)
+            datetime.now(timezone.utc), checkpoints, snapshot_path, tmp_path)
     communicator.restore_message_counts.assert_called_with(port_message_counts)
 
     assert not snapshot_manager2.resuming()
-    snapshot_manager2.reuse_instance(tmp_path, True, 1.5)
+    snapshot_manager2.reuse_instance(True, 1.5)
     assert not snapshot_manager2.resuming()
-    snapshot_manager2.reuse_instance(tmp_path, True, 2.5)
+    snapshot_manager2.reuse_instance(True, 2.5)
     manager.submit_snapshot_metadata.assert_called_once()

From fa648e8b68c05c94e71f92c75c1e73e75408adad Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Mon, 28 Nov 2022 11:36:05 +0100
Subject: [PATCH 098/183] Bugfixes, new test, create snapshot at resume

- Add test_snapshot_dispatch which is a pure serial workflow
- Each actor will also store the snapshot it is resumed from
---
 integration_test/conftest.py                  |   9 +-
 integration_test/test_snapshot_dispatch.py    | 124 ++++++++++++++++++
 integration_test/test_snapshot_macro_micro.py |  69 ++++------
 .../python/libmuscle/checkpoint_triggers.py   |   4 +-
 libmuscle/python/libmuscle/instance.py        |   5 +-
 libmuscle/python/libmuscle/mmp_client.py      |   2 +-
 .../python/libmuscle/snapshot_manager.py      |   6 +-
 .../libmuscle/test/test_snapshot_manager.py   |   4 +-
 8 files changed, 172 insertions(+), 51 deletions(-)
 create mode 100644 integration_test/test_snapshot_dispatch.py

diff --git a/integration_test/conftest.py b/integration_test/conftest.py
index ad59842a..a6c70b1e 100644
--- a/integration_test/conftest.py
+++ b/integration_test/conftest.py
@@ -26,6 +26,12 @@ def yatiml_log_warning():
     yatiml.logger.setLevel(logging.WARNING)
 
 
+def ls_snapshots(run_dir, instance=None):
+    """List all snapshots of the instance or workflow"""
+    return sorted(run_dir.snapshot_dir(instance).iterdir(),
+                  key=lambda path: tuple(map(int, path.stem.split("_")[1:])))
+
+
 def start_mmp_server(control_pipe, ymmsl_doc, run_dir):
     control_pipe[0].close()
     manager = Manager(ymmsl_doc, run_dir)
@@ -108,7 +114,8 @@ def run_manager_with_actors(
         for instance_name, callable in python_actors.items():
             proc = mp.Process(
                     target=_python_wrapper,
-                    args=(instance_name, env['MUSCLE_MANAGER'], callable))
+                    args=(instance_name, env['MUSCLE_MANAGER'], callable),
+                    name=instance_name)
             proc.start()
             python_processes.append(proc)
 
diff --git a/integration_test/test_snapshot_dispatch.py b/integration_test/test_snapshot_dispatch.py
new file mode 100644
index 00000000..2b9d3b57
--- /dev/null
+++ b/integration_test/test_snapshot_dispatch.py
@@ -0,0 +1,124 @@
+import pytest
+from ymmsl import ImplementationState, Operator, load, dump
+
+from libmuscle import Instance, Message
+from libmuscle.manager.run_dir import RunDir
+
+from .conftest import run_manager_with_actors, ls_snapshots
+
+
+_LOG_LEVEL = 'INFO'  # set to DEBUG for additional debug info
+
+
+def component():
+    instance = Instance({
+            Operator.F_INIT: ['f_i'],
+            Operator.O_F: ['o_f']})
+
+    while instance.reuse_instance():
+        dt = instance.get_setting('dt', 'float')
+        t_max = instance.get_setting('t_max', 'float')
+
+        if instance.resuming():
+            msg = instance.load_snapshot()
+            t_cur = msg.timestamp
+            i, t_stop = msg.data
+
+        if instance.should_init():
+            msg = instance.receive('f_i', default=Message(0, None, 0))
+            t_cur = msg.timestamp
+            i = msg.data
+            t_stop = t_cur + t_max
+
+        while t_cur < t_stop:
+            # faux time-integration for testing snapshots
+            t_cur += dt
+
+            if instance.should_save_snapshot(t_cur):
+                instance.save_snapshot(Message(t_cur, None, [i, t_stop]))
+
+        instance.send('o_f', Message(t_cur, None, i))
+
+        if instance.should_save_final_snapshot():
+            instance.save_final_snapshot(Message(t_cur, None, [i, t_stop]))
+
+
+def stateless_component():
+    instance = Instance({
+            Operator.F_INIT: ['f_i'],
+            Operator.O_F: ['o_f']},
+            stateful=ImplementationState.STATELESS)
+
+    while instance.reuse_instance():
+        dt = instance.get_setting('dt', 'float')
+        t_max = instance.get_setting('t_max', 'float')
+
+        msg = instance.receive('f_i', default=Message(0, None, 0))
+        t_cur = msg.timestamp
+        i = msg.data
+        t_stop = t_cur + t_max
+
+        while t_cur < t_stop:
+            # faux time-integration for testing snapshots
+            t_cur += dt
+
+        instance.send('o_f', Message(t_cur, None, i))
+
+
+@pytest.fixture
+def dispatch_config():
+    return load(f"""ymmsl_version: v0.1
+model:
+  name: test_snapshot
+  components:
+    comp1: component
+    comp2: component
+    comp3: component
+    comp4: component
+    comp5: component
+  conduits:
+    comp1.o_f: comp2.f_i
+    comp2.o_f: comp3.f_i
+    comp3.o_f: comp4.f_i
+    comp4.o_f: comp5.f_i
+settings:
+  dt: 0.1234
+  t_max: 2.0
+  muscle_remote_log_level: {_LOG_LEVEL}
+checkpoints:
+  at_end: true
+  simulation_time:
+  - every: 2.5
+  - at:
+    - 2.3
+    - 2.8""")
+
+
+def test_snapshot_dispatch(tmp_path, dispatch_config):
+    actors = {f'comp{i + 1}': component for i in range(5)}
+    run_dir1 = RunDir(tmp_path / 'run1')
+    run_manager_with_actors(
+            dump(dispatch_config), run_dir1.path, python_actors=actors)
+
+    assert len(ls_snapshots(run_dir1, 'comp1')) == 2  # t=0, at_end
+    assert len(ls_snapshots(run_dir1, 'comp2')) == 5  # t=0, 2.5, 2.3, 2.8, at_end
+    assert len(ls_snapshots(run_dir1, 'comp3')) == 3  # t=2.5, 5, at_end
+    assert len(ls_snapshots(run_dir1, 'comp4')) == 3  # t=5, 7.5, at_end
+    assert len(ls_snapshots(run_dir1, 'comp5')) == 3  # t=7.5, 10, at_end
+
+    snapshots_ymmsl = ls_snapshots(run_dir1)
+    snapshot_docs = list(map(load, snapshots_ymmsl))
+    assert len(snapshot_docs) == 16
+
+    # resume from the snapshots taken at t>=2.3
+    run_dir2 = RunDir(tmp_path / 'run2')
+    dispatch_config.update(snapshot_docs[3])  # add resume info
+    run_manager_with_actors(
+            dump(dispatch_config), run_dir2.path, python_actors=actors)
+
+    assert len(ls_snapshots(run_dir2, 'comp1')) == 1  # resume
+    assert len(ls_snapshots(run_dir2, 'comp2')) == 4  # resume, t=2.5, 2.8, at_end
+    assert len(ls_snapshots(run_dir2, 'comp3')) == 3  # t=2.5, 5, at_end
+    assert len(ls_snapshots(run_dir2, 'comp4')) == 3  # t=5, 7.5, at_end
+    assert len(ls_snapshots(run_dir2, 'comp5')) == 3  # t=7.5, 10, at_end
+    assert len(ls_snapshots(run_dir2)) == 13
diff --git a/integration_test/test_snapshot_macro_micro.py b/integration_test/test_snapshot_macro_micro.py
index 0420df98..f8b11cb4 100644
--- a/integration_test/test_snapshot_macro_micro.py
+++ b/integration_test/test_snapshot_macro_micro.py
@@ -1,11 +1,11 @@
-from .conftest import run_manager_with_actors
-
 import pytest
 from ymmsl import ImplementationState, Operator, load, dump
 
 from libmuscle import Instance, Message
 from libmuscle.manager.run_dir import RunDir
 
+from .conftest import run_manager_with_actors, ls_snapshots
+
 
 _LOG_LEVEL = 'INFO'  # set to DEBUG for additional debug info
 
@@ -202,13 +202,11 @@ def test_snapshot_macro_micro(tmp_path, base_config):
     run_manager_with_actors(
             dump(base_config), run_dir1.path, python_actors=actors)
 
-    # Note: sorted only works because we have fewer than 10 snapshots, otherwise
-    # _10 would be sorted right after _1
-    macro_snapshots = sorted(run_dir1.snapshot_dir('macro').iterdir())
+    macro_snapshots = ls_snapshots(run_dir1, 'macro')
     assert len(macro_snapshots) == 6  # 0, 0.4, 0.8, 1.2, 1.6, final
-    micro_snapshots = sorted(run_dir1.snapshot_dir('micro').iterdir())
+    micro_snapshots = ls_snapshots(run_dir1, 'micro')
     assert len(micro_snapshots) == 6  # 0, 0.4, 0.8, 1.2, 1.6, final
-    snapshots_ymmsl = sorted(run_dir1.snapshot_dir().iterdir())
+    snapshots_ymmsl = ls_snapshots(run_dir1)
     snapshot_docs = list(map(load, snapshots_ymmsl))
     assert len(snapshot_docs) == 7
     assert snapshot_docs[0].resume['macro'] == macro_snapshots[0]
@@ -225,12 +223,9 @@ def test_snapshot_macro_micro(tmp_path, base_config):
     run_manager_with_actors(
             dump(base_config), run_dir2.path, python_actors=actors)
 
-    macro_snapshots = sorted(run_dir2.snapshot_dir('macro').iterdir())
-    assert len(macro_snapshots) == 2  # 1.6, final
-    micro_snapshots = sorted(run_dir2.snapshot_dir('micro').iterdir())
-    assert len(micro_snapshots) == 2  # 1.6, final
-    snapshots_ymmsl = sorted(run_dir2.snapshot_dir().iterdir())
-    assert len(snapshots_ymmsl) == 2
+    assert len(ls_snapshots(run_dir2, 'macro')) == 3  # resume, 1.6, final
+    assert len(ls_snapshots(run_dir2, 'micro')) == 3  # resume, 1.6, final
+    assert len(ls_snapshots(run_dir2)) == 3
 
     # resume from the first workflow snapshot (this restarts macro from scratch)
     run_dir3 = RunDir(tmp_path / 'run3')
@@ -247,13 +242,9 @@ def test_snapshot_macro_stateless_micro(tmp_path, base_config):
     run_manager_with_actors(
             dump(base_config), run_dir1.path, python_actors=actors)
 
-    # Note: sorted only works because we have fewer than 10 snapshots, otherwise
-    # _10 would be sorted right after _1
-    macro_snapshots = sorted(run_dir1.snapshot_dir('macro').iterdir())
-    assert len(macro_snapshots) == 6  # 0, 0.4, 0.8, 1.2, 1.6, final
-    micro_snapshots = sorted(run_dir1.snapshot_dir('micro').iterdir())
-    assert len(micro_snapshots) == 6  # 0, 0.4, 0.8, 1.2, 1.6, final
-    snapshots_ymmsl = sorted(run_dir1.snapshot_dir().iterdir())
+    assert len(ls_snapshots(run_dir1, 'macro')) == 6  # 0, 0.4, 0.8, 1.2, 1.6, final
+    assert len(ls_snapshots(run_dir1, 'micro')) == 6  # 0, 0.4, 0.8, 1.2, 1.6, final
+    snapshots_ymmsl = ls_snapshots(run_dir1)
     snapshot_docs = list(map(load, snapshots_ymmsl))
     assert len(snapshot_docs) == 6
 
@@ -263,12 +254,9 @@ def test_snapshot_macro_stateless_micro(tmp_path, base_config):
     run_manager_with_actors(
             dump(base_config), run_dir2.path, python_actors=actors)
 
-    macro_snapshots = sorted(run_dir2.snapshot_dir('macro').iterdir())
-    assert len(macro_snapshots) == 2  # 1.6, final
-    micro_snapshots = sorted(run_dir2.snapshot_dir('micro').iterdir())
-    assert len(micro_snapshots) == 3  # 1.2, 1.6, final
-    snapshots_ymmsl = sorted(run_dir2.snapshot_dir().iterdir())
-    assert len(snapshots_ymmsl) == 2
+    assert len(ls_snapshots(run_dir2, 'macro')) == 3  # resume, 1.6, final
+    assert len(ls_snapshots(run_dir2, 'micro')) == 4  # resume, 1.2, 1.6, final
+    assert len(ls_snapshots(run_dir2)) == 3
 
 
 def test_snapshot_macro_vector_micro(tmp_path, base_config):
@@ -279,13 +267,10 @@ def test_snapshot_macro_vector_micro(tmp_path, base_config):
     run_manager_with_actors(
             dump(base_config), run_dir1.path, python_actors=actors)
 
-    macro_snapshots = sorted(run_dir1.snapshot_dir('macro').iterdir())
-    assert len(macro_snapshots) == 6  # 0, 0.4, 0.8, 1.2, 1.6, final
-    micro_snapshots = sorted(run_dir1.snapshot_dir('micro[0]').iterdir())
-    assert len(micro_snapshots) == 6  # 0, 0.4, 0.8, 1.2, 1.6, final
-    micro_snapshots = sorted(run_dir1.snapshot_dir('micro[1]').iterdir())
-    assert len(micro_snapshots) == 6  # 0, 0.4, 0.8, 1.2, 1.6, final
-    snapshots_ymmsl = sorted(run_dir1.snapshot_dir().iterdir())
+    assert len(ls_snapshots(run_dir1, 'macro')) == 6  # 0, 0.4, 0.8, 1.2, 1.6, final
+    assert len(ls_snapshots(run_dir1, 'micro[0]')) == 6  # 0, 0.4, 0.8, 1.2, 1.6, final
+    assert len(ls_snapshots(run_dir1, 'micro[1]')) == 6  # 0, 0.4, 0.8, 1.2, 1.6, final
+    snapshots_ymmsl = ls_snapshots(run_dir1)
     assert len(snapshots_ymmsl) == 8
 
     run_dir2 = RunDir(tmp_path / 'run2')
@@ -293,14 +278,10 @@ def test_snapshot_macro_vector_micro(tmp_path, base_config):
     run_manager_with_actors(
             dump(base_config), run_dir2.path, python_actors=actors)
 
-    macro_snapshots = sorted(run_dir2.snapshot_dir('macro').iterdir())
-    assert len(macro_snapshots) == 2  # 1.6, final
-    micro_snapshots = sorted(run_dir2.snapshot_dir('micro[0]').iterdir())
-    assert len(micro_snapshots) == 2  # 1.6, final
-    micro_snapshots = sorted(run_dir2.snapshot_dir('micro[1]').iterdir())
-    assert len(micro_snapshots) == 2  # 1.6, final
-    snapshots_ymmsl = sorted(run_dir2.snapshot_dir().iterdir())
-    assert len(snapshots_ymmsl) == 2
+    assert len(ls_snapshots(run_dir2, 'macro')) == 3  # resume, 1.6, final
+    assert len(ls_snapshots(run_dir2, 'micro[0]')) == 3  # resume, 1.6, final
+    assert len(ls_snapshots(run_dir2, 'micro[1]')) == 3  # resume, 1.6, final
+    assert len(ls_snapshots(run_dir2)) == 3
 
 
 def test_snapshot_macro_transformer_micro(tmp_path, config_with_transformer):
@@ -311,7 +292,7 @@ def test_snapshot_macro_transformer_micro(tmp_path, config_with_transformer):
     run_manager_with_actors(
             dump(config_with_transformer), run_dir1.path, python_actors=actors)
 
-    snapshots_ymmsl = sorted(run_dir1.snapshot_dir().iterdir())
+    snapshots_ymmsl = ls_snapshots(run_dir1)
     assert len(snapshots_ymmsl) == 8
 
     # pick one to resume from
@@ -320,5 +301,5 @@ def test_snapshot_macro_transformer_micro(tmp_path, config_with_transformer):
     run_manager_with_actors(
             dump(config_with_transformer), run_dir2.path, python_actors=actors)
 
-    snapshots_ymmsl = sorted(run_dir2.snapshot_dir().iterdir())
-    assert len(snapshots_ymmsl) == 3
+    snapshots_ymmsl = ls_snapshots(run_dir2)
+    assert len(snapshots_ymmsl) == 6
diff --git a/libmuscle/python/libmuscle/checkpoint_triggers.py b/libmuscle/python/libmuscle/checkpoint_triggers.py
index a33a785d..a4edf3be 100644
--- a/libmuscle/python/libmuscle/checkpoint_triggers.py
+++ b/libmuscle/python/libmuscle/checkpoint_triggers.py
@@ -297,8 +297,8 @@ def reuse_instance(self) -> None:
                 _checkpoint_error('You must call "should_save_final" exactly'
                                   ' once in the reuse loop of an instance that'
                                   ' supports checkpointing.')
-            self._should_save_final_called = False
-            self._saved_final_checkpoint = False
+        self._should_save_final_called = False
+        self._saved_final_checkpoint = False
 
     def update_checkpoints(self, timestamp: float, final: bool) -> None:
         """Update last and next checkpoint times when a snapshot is made.
diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py
index 48bb2905..a0984fa2 100644
--- a/libmuscle/python/libmuscle/instance.py
+++ b/libmuscle/python/libmuscle/instance.py
@@ -670,6 +670,8 @@ def __check_reuse_instance(self, apply_overlay: bool) -> bool:
         # receive something that was sent on the last go-around.
         # At least emit a warning.
         if self.should_init() or not self._first_run:
+            # self.should_init() might be False in first should_save_final(),
+            # but self._first_run is already updated by then
             self.__pre_receive_f_init(apply_overlay)
 
         self._set_local_log_level()
@@ -682,7 +684,8 @@ def __check_reuse_instance(self, apply_overlay: bool) -> bool:
         no_settings_in = not self._communicator.settings_in_connected()
 
         if f_init_not_connected and no_settings_in:
-            do_reuse = self._first_run
+            do_reuse = self._first_run and (not self.resuming() or
+                                            not self.should_init())
         else:
             for message in self._f_init_cache.values():
                 if isinstance(message.data, ClosePort):
diff --git a/libmuscle/python/libmuscle/mmp_client.py b/libmuscle/python/libmuscle/mmp_client.py
index f40ea48b..188814ff 100644
--- a/libmuscle/python/libmuscle/mmp_client.py
+++ b/libmuscle/python/libmuscle/mmp_client.py
@@ -56,7 +56,7 @@ def encode_profile_event(event: ProfileEvent) -> Any:
 
 def decode_checkpoint_rule(rule: Dict[str, Any]) -> CheckpointRule:
     """Decode a checkpoint rule from a MsgPack-compatible value."""
-    if rule.keys() == {'in'}:
+    if rule.keys() == {'at'}:
         return CheckpointAtRule(**rule)
     if rule.keys() == {'start', 'stop', 'every'}:
         return CheckpointRangeRule(**rule)
diff --git a/libmuscle/python/libmuscle/snapshot_manager.py b/libmuscle/python/libmuscle/snapshot_manager.py
index 0678c96c..0bd3de83 100644
--- a/libmuscle/python/libmuscle/snapshot_manager.py
+++ b/libmuscle/python/libmuscle/snapshot_manager.py
@@ -82,6 +82,10 @@ def _set_checkpoint_info(self,
                     snapshot.is_final_snapshot)
             self._communicator.restore_message_counts(
                 snapshot.port_message_counts)
+            # Store a copy of the snapshot in the current run directory
+            path = self.__store_snapshot(snapshot)
+            metadata = SnapshotMetadata.from_snapshot(snapshot, str(path))
+            self._manager.submit_snapshot_metadata(self._instance_id, metadata)
 
     def reuse_instance(self,
                        do_reuse: bool, f_init_max_timestamp: Optional[float]
@@ -202,7 +206,7 @@ def __save_snapshot(
         metadata = SnapshotMetadata.from_snapshot(snapshot, str(path))
         self._manager.submit_snapshot_metadata(self._instance_id, metadata)
 
-        timestamp = msg.timestamp if msg is not None else -1.0
+        timestamp = msg.timestamp if msg is not None else float('-inf')
         if final and f_init_max_timestamp is not None:
             # For final snapshots f_init_max_snapshot is the reference time (see
             # should_save_final_snapshot).
diff --git a/libmuscle/python/libmuscle/test/test_snapshot_manager.py b/libmuscle/python/libmuscle/test/test_snapshot_manager.py
index 6325cb0d..ffec4744 100644
--- a/libmuscle/python/libmuscle/test/test_snapshot_manager.py
+++ b/libmuscle/python/libmuscle/test/test_snapshot_manager.py
@@ -106,7 +106,7 @@ def test_save_load_snapshot(tmp_path: Path) -> None:
     assert metadata.is_final_snapshot
     snapshot_path = Path(metadata.snapshot_filename)
     assert snapshot_path.parent == tmp_path
-    assert snapshot_path.name == 'test-1_2.pack'
+    assert snapshot_path.name == 'test-1_3.pack'
 
     assert snapshot_manager2.resuming()
     snapshot_manager2.reuse_instance(True, None)
@@ -143,6 +143,8 @@ def test_save_load_implicit_snapshot(tmp_path: Path) -> None:
     snapshot_manager2._set_checkpoint_info(
             datetime.now(timezone.utc), checkpoints, snapshot_path, tmp_path)
     communicator.restore_message_counts.assert_called_with(port_message_counts)
+    manager.submit_snapshot_metadata.assert_called_once()
+    manager.submit_snapshot_metadata.reset_mock()
 
     assert not snapshot_manager2.resuming()
     snapshot_manager2.reuse_instance(True, 1.5)

From b2dcfae2b131a6dca4ef404fe426214c9798ca75 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Mon, 28 Nov 2022 14:02:01 +0100
Subject: [PATCH 099/183] Fix non-deterministic CI failures

---
 integration_test/test_snapshot_dispatch.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/integration_test/test_snapshot_dispatch.py b/integration_test/test_snapshot_dispatch.py
index 2b9d3b57..3c2c791b 100644
--- a/integration_test/test_snapshot_dispatch.py
+++ b/integration_test/test_snapshot_dispatch.py
@@ -108,7 +108,9 @@ def test_snapshot_dispatch(tmp_path, dispatch_config):
 
     snapshots_ymmsl = ls_snapshots(run_dir1)
     snapshot_docs = list(map(load, snapshots_ymmsl))
-    assert len(snapshot_docs) == 16
+    # More ymmsl restarts files may be possible, depending on the sequence of
+    # incoming SnapshotMetadata...
+    assert len(snapshot_docs) >= 16
 
     # resume from the snapshots taken at t>=2.3
     run_dir2 = RunDir(tmp_path / 'run2')
@@ -121,4 +123,6 @@ def test_snapshot_dispatch(tmp_path, dispatch_config):
     assert len(ls_snapshots(run_dir2, 'comp3')) == 3  # t=2.5, 5, at_end
     assert len(ls_snapshots(run_dir2, 'comp4')) == 3  # t=5, 7.5, at_end
     assert len(ls_snapshots(run_dir2, 'comp5')) == 3  # t=7.5, 10, at_end
-    assert len(ls_snapshots(run_dir2)) == 13
+    # More ymmsl restarts files may be possible, depending on the sequence of
+    # incoming SnapshotMetadata...
+    assert len(ls_snapshots(run_dir2)) >= 13

From 978dd6d125ff5d5a0136caef6524f59bb0f1a98c Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Mon, 28 Nov 2022 14:26:59 +0100
Subject: [PATCH 100/183] More checks to understand CI failures

---
 integration_test/test_snapshot_dispatch.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/integration_test/test_snapshot_dispatch.py b/integration_test/test_snapshot_dispatch.py
index 3c2c791b..495bcd55 100644
--- a/integration_test/test_snapshot_dispatch.py
+++ b/integration_test/test_snapshot_dispatch.py
@@ -96,6 +96,7 @@ def dispatch_config():
 
 def test_snapshot_dispatch(tmp_path, dispatch_config):
     actors = {f'comp{i + 1}': component for i in range(5)}
+    (tmp_path / 'run1').mkdir()
     run_dir1 = RunDir(tmp_path / 'run1')
     run_manager_with_actors(
             dump(dispatch_config), run_dir1.path, python_actors=actors)
@@ -108,13 +109,20 @@ def test_snapshot_dispatch(tmp_path, dispatch_config):
 
     snapshots_ymmsl = ls_snapshots(run_dir1)
     snapshot_docs = list(map(load, snapshots_ymmsl))
-    # More ymmsl restarts files may be possible, depending on the sequence of
-    # incoming SnapshotMetadata...
-    assert len(snapshot_docs) >= 16
+    assert len(snapshot_docs) == 16
 
     # resume from the snapshots taken at t>=2.3
+    (tmp_path / 'run2').mkdir()
     run_dir2 = RunDir(tmp_path / 'run2')
     dispatch_config.update(snapshot_docs[3])  # add resume info
+    # validate resume info
+    resume = snapshot_docs[3].resume
+    assert resume['comp1'] == ls_snapshots(run_dir1, 'comp1')[1]
+    assert resume['comp2'] == ls_snapshots(run_dir1, 'comp2')[1]
+    assert 'comp3' not in resume
+    assert 'comp4' not in resume
+    assert 'comp5' not in resume
+
     run_manager_with_actors(
             dump(dispatch_config), run_dir2.path, python_actors=actors)
 

From 023fabef5d78aca06420dc9c8a5cfc55430ca8f2 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Mon, 28 Nov 2022 14:32:53 +0100
Subject: [PATCH 101/183] Deterministic restart for dispatch test case

---
 integration_test/test_snapshot_dispatch.py | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/integration_test/test_snapshot_dispatch.py b/integration_test/test_snapshot_dispatch.py
index 495bcd55..106f6d3c 100644
--- a/integration_test/test_snapshot_dispatch.py
+++ b/integration_test/test_snapshot_dispatch.py
@@ -96,7 +96,6 @@ def dispatch_config():
 
 def test_snapshot_dispatch(tmp_path, dispatch_config):
     actors = {f'comp{i + 1}': component for i in range(5)}
-    (tmp_path / 'run1').mkdir()
     run_dir1 = RunDir(tmp_path / 'run1')
     run_manager_with_actors(
             dump(dispatch_config), run_dir1.path, python_actors=actors)
@@ -109,19 +108,15 @@ def test_snapshot_dispatch(tmp_path, dispatch_config):
 
     snapshots_ymmsl = ls_snapshots(run_dir1)
     snapshot_docs = list(map(load, snapshots_ymmsl))
+    # More ymmsl restarts files may be possible, depending on the sequence of
+    # incoming SnapshotMetadata...
     assert len(snapshot_docs) == 16
 
     # resume from the snapshots taken at t>=2.3
-    (tmp_path / 'run2').mkdir()
     run_dir2 = RunDir(tmp_path / 'run2')
-    dispatch_config.update(snapshot_docs[3])  # add resume info
-    # validate resume info
-    resume = snapshot_docs[3].resume
-    assert resume['comp1'] == ls_snapshots(run_dir1, 'comp1')[1]
-    assert resume['comp2'] == ls_snapshots(run_dir1, 'comp2')[1]
-    assert 'comp3' not in resume
-    assert 'comp4' not in resume
-    assert 'comp5' not in resume
+    dispatch_config.resume = {
+        'comp1': ls_snapshots(run_dir1, 'comp1')[1],
+        'comp2': ls_snapshots(run_dir1, 'comp2')[1]}
 
     run_manager_with_actors(
             dump(dispatch_config), run_dir2.path, python_actors=actors)

From 4675ba052d17ed0bd38b7073ef51d49e56e6d17f Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Mon, 28 Nov 2022 16:39:03 +0100
Subject: [PATCH 102/183] Snapshot tests for interact & scale bridge

---
 .../examples/python/interact_coupling.py      |  85 ++++++++-
 integration_test/test_snapshot_interact.py    | 164 ++++++++++++++++++
 2 files changed, 242 insertions(+), 7 deletions(-)
 create mode 100644 integration_test/test_snapshot_interact.py

diff --git a/docs/source/examples/python/interact_coupling.py b/docs/source/examples/python/interact_coupling.py
index ff9408c7..3df5e11e 100644
--- a/docs/source/examples/python/interact_coupling.py
+++ b/docs/source/examples/python/interact_coupling.py
@@ -1,5 +1,5 @@
 import logging
-from typing import Any, Optional, Tuple
+from typing import Any, Optional, Tuple, Dict
 
 from libmuscle import Instance, Message
 from libmuscle.runner import run_simulation
@@ -129,7 +129,8 @@ class Peer:
     via the instance object.
     """
     def __init__(
-            self, instance: Instance, in_port: str, out_port: str) -> None:
+            self, instance: Instance, in_port: str, out_port: str,
+            resume_from_state: Any = None) -> None:
         """Create a Peer object.
 
         This also receives an initial message from the peer model, and
@@ -145,11 +146,20 @@ def __init__(
         self.out_port = out_port
         self.cache = DataCache()
 
-        msg = self.instance.receive(self.in_port)
-        self.cache.add_data(msg.timestamp, msg.data)
-        self.rcvd = msg.timestamp
-        self.to_send = msg.timestamp
-        self.next = msg.next_timestamp
+        if resume_from_state:
+            self.cache.t_cur = resume_from_state['cache.t_cur']
+            self.cache.data_cur = resume_from_state['cache.data_cur']
+            self.cache.t_next = resume_from_state['cache.t_next']
+            self.cache.data_next = resume_from_state['cache.data_next']
+            self.rcvd = resume_from_state['rcvd']
+            self.to_send = resume_from_state['to_send']
+            self.next = resume_from_state['next']
+        else:
+            msg = self.instance.receive(self.in_port)
+            self.cache.add_data(msg.timestamp, msg.data)
+            self.rcvd = msg.timestamp
+            self.to_send = msg.timestamp
+            self.next = msg.next_timestamp
 
     def done(self) -> bool:
         """Return whether we are done commmunicating with this peer."""
@@ -200,6 +210,17 @@ def send(self, t: float, data: Any) -> None:
         self.instance.send(self.out_port, Message(t, self.next, data))
         self.to_send = self.next
 
+    def get_state(self) -> Dict[str, Any]:
+        """Return the current state of this object as a MUSCLE-serializable dict
+        """
+        return {'cache.t_cur': self.cache.t_cur,
+                'cache.data_cur': self.cache.data_cur,
+                'cache.t_next': self.cache.t_next,
+                'cache.data_next': self.cache.data_next,
+                'rcvd': self.rcvd,
+                'to_send': self.to_send,
+                'next': self.next}
+
 
 def temporal_coupler() -> None:
     """Model component connecting two scale-overlapping submodels.
@@ -241,6 +262,56 @@ def temporal_coupler() -> None:
                 b.send(t, data)
 
 
+def checkpointing_temporal_coupler() -> None:
+    """Model component connecting two scale-overlapping submodels.
+
+    This component sits in between two scale-overlapping submodels
+    running at different (and potentially variable) timesteps and
+    ensures that each of these peers receives a message whenever it
+    expects one, and can send a message whenever it expects to do so.
+
+    This function extends :func:`temporal_coupler` with checkpointing
+    capabilities.
+    """
+    instance = Instance({
+        Operator.O_I: ['a_out', 'b_out'],
+        Operator.S: ['a_in', 'b_in']})
+
+    while instance.reuse_instance():
+        if instance.resuming():
+            state = instance.load_snapshot().data
+            if state is not None:
+                a = Peer(instance, 'a_in', 'a_out', state['a'])
+                b = Peer(instance, 'b_in', 'b_out', state['b'])
+
+        if instance.should_init():
+            # Receive initial messages and initialise state
+            a = Peer(instance, 'a_in', 'a_out')
+            b = Peer(instance, 'b_in', 'b_out')
+
+        # Send and receive as needed
+        while not a.done() or not b.done():
+            if a.can_receive():
+                a.receive()
+            elif b.can_receive():
+                b.receive()
+            elif a.can_send(b.rcvd, b.next):
+                t, data = b.cache.get_data(a.to_send)
+                a.send(t, data)
+            elif b.can_send(a.rcvd, a.next):
+                t, data = a.cache.get_data(b.to_send)
+                b.send(t, data)
+
+            t_cur = min(a.rcvd, b.rcvd)
+            if instance.should_save_snapshot(t_cur):
+                instance.save_snapshot(Message(
+                        t_cur, None, {'a': a.get_state(), 'b': b.get_state()}))
+
+        t_cur = min(a.rcvd, b.rcvd)
+        if instance.should_save_final_snapshot():
+            instance.save_final_snapshot(Message(t_cur, None, None))
+
+
 if __name__ == '__main__':
     logging.basicConfig()
     logging.getLogger().setLevel(logging.INFO)
diff --git a/integration_test/test_snapshot_interact.py b/integration_test/test_snapshot_interact.py
new file mode 100644
index 00000000..5492f9e2
--- /dev/null
+++ b/integration_test/test_snapshot_interact.py
@@ -0,0 +1,164 @@
+import logging
+import sys
+from pathlib import Path
+
+import pytest
+from ymmsl import Operator, load, dump
+
+from libmuscle import Instance, Message
+from libmuscle.manager.run_dir import RunDir
+
+from .conftest import run_manager_with_actors, ls_snapshots
+
+# Make interact_coupling.py available (from docs/sources/examples)
+sys.path.append(str(
+        Path(__file__).parents[1] / 'docs' / 'source' / 'examples' / 'python'))
+import interact_coupling  # noqa
+
+_LOG_LEVEL = 'INFO'  # set to DEBUG for additional debug info
+
+
+def component():
+    instance = Instance({
+            Operator.O_I: ['o_i'],
+            Operator.S: ['s']})
+
+    while instance.reuse_instance():
+        t0 = instance.get_setting('t0', 'float')
+        dt = instance.get_setting('dt', 'float')
+        t_max = instance.get_setting('t_max', 'float')
+
+        if instance.resuming():
+            msg = instance.load_snapshot()
+            t_cur = msg.timestamp
+            i, t_stop = msg.data
+
+        if instance.should_init():
+            t_cur = t0
+            i = 0
+            t_stop = t0 + t_max
+
+        rcvd_i = 0
+        while t_cur < t_stop:
+            # faux time-integration for testing snapshots
+            t_next = t_cur + dt
+            if t_next >= t_stop:
+                t_next = None
+            logging.info(f'Sending {i} at {t_cur}, next at {t_next}')
+            instance.send('o_i', Message(t_cur, t_next, i))
+
+            msg = instance.receive('s')
+            logging.info(
+                    f'Received {msg.data} from time {msg.timestamp},'
+                    f' next at {msg.next_timestamp}')
+            assert msg.data >= rcvd_i
+            rcvd_i = msg.data
+
+            t_cur += dt
+            i += 1
+
+            if instance.should_save_snapshot(t_cur):
+                instance.save_snapshot(Message(t_cur, None, [i, t_stop]))
+
+        if instance.should_save_final_snapshot():
+            instance.save_final_snapshot(Message(t_cur, None, [i, t_stop]))
+
+
+def test_snapshot_interact_lockstep(tmp_path):
+    config = f"""ymmsl_version: v0.1
+model:
+  name: test_snapshot
+  components:
+    comp1: component
+    comp2: component
+  conduits:
+    comp1.o_i: comp2.s
+    comp2.o_i: comp1.s
+settings:
+  t0: 0.35
+  dt: 0.1234
+  t_max: 3.0
+  muscle_remote_log_level: {_LOG_LEVEL}
+checkpoints:
+  simulation_time:
+  - every: 1.0
+    start: 0.75
+    stop: 2.0
+  - at:
+    - 2.5"""
+    actors = {f'comp{i + 1}': component for i in range(2)}
+
+    run_dir1 = RunDir(tmp_path / 'run1')
+    run_manager_with_actors(config, run_dir1.path, python_actors=actors)
+
+    assert len(ls_snapshots(run_dir1, 'comp1')) == 3  # t=0.75, 1.75, 2.5
+    assert len(ls_snapshots(run_dir1, 'comp2')) == 3  # t=0.75, 1.75, 2.5
+
+    snapshots_ymmsl = ls_snapshots(run_dir1)
+    snapshot_docs = list(map(load, snapshots_ymmsl))
+    assert len(snapshot_docs) == 3
+
+    # resume from the snapshots taken at t>=1.75
+    run_dir2 = RunDir(tmp_path / 'run2')
+    config_doc = load(config)
+    config_doc.update(snapshot_docs[1])
+
+    run_manager_with_actors(
+            dump(config_doc), run_dir2.path, python_actors=actors)
+
+    assert len(ls_snapshots(run_dir2, 'comp1')) == 2  # resume, t=2.5
+    assert len(ls_snapshots(run_dir2, 'comp2')) == 2  # resume, t=2.5
+    assert len(ls_snapshots(run_dir2)) == 2
+
+
+@pytest.mark.parametrize('scale', [0.1, 0.9, 1.0, 1.1, 1.5])
+def test_snapshot_interact_varstep(tmp_path, scale):
+    config = f"""ymmsl_version: v0.1
+model:
+  name: test_snapshot
+  components:
+    comp1: component
+    comp2: component
+    coupler: checkpointing_temporal_coupler
+  conduits:
+    comp1.o_i: coupler.a_in
+    coupler.a_out: comp1.s
+    comp2.o_i: coupler.b_in
+    coupler.b_out: comp2.s
+settings:
+  t0: 0.35
+  comp1.dt: 0.1234
+  comp2.dt: {0.1234 * scale}
+  t_max: 3.0
+  muscle_remote_log_level: {_LOG_LEVEL}
+checkpoints:
+  simulation_time:
+  - every: 1.0
+    start: 0.75
+    stop: 2.0
+  - at:
+    - 2.5"""
+    actors = {f'comp{i + 1}': component for i in range(2)}
+    actors['coupler'] = interact_coupling.checkpointing_temporal_coupler
+
+    run_dir1 = RunDir(tmp_path / 'run1')
+    run_manager_with_actors(config, run_dir1.path, python_actors=actors)
+
+    assert len(ls_snapshots(run_dir1, 'comp1')) == 3  # t=0.75, 1.75, 2.5
+    assert len(ls_snapshots(run_dir1, 'comp2')) == 3  # t=0.75, 1.75, 2.5
+
+    snapshots_ymmsl = ls_snapshots(run_dir1)
+    snapshot_docs = list(map(load, snapshots_ymmsl))
+    assert len(snapshot_docs) == 3
+
+    # resume from the snapshots taken at t>=1.75
+    run_dir2 = RunDir(tmp_path / 'run2')
+    config_doc = load(config)
+    config_doc.update(snapshot_docs[1])
+
+    run_manager_with_actors(
+            dump(config_doc), run_dir2.path, python_actors=actors)
+
+    assert len(ls_snapshots(run_dir2, 'comp1')) == 2  # resume, t=2.5
+    assert len(ls_snapshots(run_dir2, 'comp2')) == 2  # resume, t=2.5
+    assert len(ls_snapshots(run_dir2)) == 2

From 1b24bdddbacca7d4e1be00c1fc18ba7449dae2b7 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Tue, 29 Nov 2022 16:26:35 +0100
Subject: [PATCH 103/183] Fix message=None cases in cmdline tool

---
 muscle3/muscle3.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/muscle3/muscle3.py b/muscle3/muscle3.py
index 484e4335..f715d5fd 100644
--- a/muscle3/muscle3.py
+++ b/muscle3/muscle3.py
@@ -133,7 +133,8 @@ def snapshot(
         typ = 'Final' if snapshot.is_final_snapshot else 'Intermediate'
         properties = OrderedDict([
             ('Snapshot type', typ),
-            ('Snapshot timestamp', snapshot.message.timestamp),
+            ('Snapshot timestamp',
+             snapshot.message.timestamp if snapshot.message else float('-inf')),
             ('Snapshot wallclock time', snapshot.wallclock_time),
             ('Snapshot triggers', snapshot.triggers),
         ])
@@ -146,7 +147,10 @@ def snapshot(
             click.echo(prop_value)
         if data:
             click.secho('Snapshot data:', bold=True)
-            click.echo(snapshot.message.data)
+            if snapshot.message is not None:
+                click.echo(snapshot.message.data)
+            else:
+                click.secho("No data available", italic=True)
         click.echo()
 
 

From 0106b2193e8431271051bcb61d4f6df6cc657ee4 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Tue, 29 Nov 2022 16:27:25 +0100
Subject: [PATCH 104/183] Add complex coupling checkpointing test

Checkpointing based on wallclock_time.
---
 .../test_snapshot_complex_coupling.py         | 185 ++++++++++++++++++
 1 file changed, 185 insertions(+)
 create mode 100644 integration_test/test_snapshot_complex_coupling.py

diff --git a/integration_test/test_snapshot_complex_coupling.py b/integration_test/test_snapshot_complex_coupling.py
new file mode 100644
index 00000000..84ffc4af
--- /dev/null
+++ b/integration_test/test_snapshot_complex_coupling.py
@@ -0,0 +1,185 @@
+import random
+import time
+
+import pytest
+from ymmsl import ImplementationState, Operator, load, dump
+
+from libmuscle import Instance, Message
+from libmuscle.manager.run_dir import RunDir
+
+from .conftest import run_manager_with_actors, ls_snapshots
+
+
+_LOG_LEVEL = 'INFO'  # set to DEBUG for additional debug info
+
+
+def cache_component(max_channels=2):
+    ports = {Operator.F_INIT: [f'in{i+1}' for i in range(max_channels)],
+             Operator.O_I: [f'sub_out{i+1}' for i in range(max_channels)],
+             Operator.S: [f'sub_in{i+1}' for i in range(max_channels)],
+             Operator.O_F: [f'out{i+1}' for i in range(max_channels)]}
+    instance = Instance(ports)
+
+    cache_t = float('-inf')
+    cache_data = []
+    max_cache_age = None
+    while instance.reuse_instance():
+        cache_valid_range = instance.get_setting('cache_valid', '[float]')
+        if max_cache_age is None:
+            max_cache_age = random.uniform(*cache_valid_range)
+
+        msgs = [instance.receive(port) if instance.is_connected(port) else None
+                for port in ports[Operator.F_INIT]]
+        cur_t = msgs[0].timestamp
+
+        if cur_t - cache_t >= max_cache_age:
+            # Cached value is no longer valid, run submodel for updated data
+            for msg, port in zip(msgs, ports[Operator.O_I]):
+                if msg is not None:
+                    instance.send(port, Message(cur_t, None, msg.data))
+            cache_data = [instance.receive(port).data
+                          if instance.is_connected(port) else None
+                          for port in ports[Operator.S]]
+            cache_t = cur_t
+            max_cache_age = random.uniform(*cache_valid_range)
+
+        for data, port in zip(cache_data, ports[Operator.O_F]):
+            if data is not None:
+                instance.send(port, Message(cur_t, None, data))
+
+        if instance.should_save_final_snapshot():
+            instance.save_final_snapshot(Message(cur_t, None, []))
+
+
+def echo_component(max_channels=2):
+    ports = {Operator.F_INIT: [f'in{i+1}' for i in range(max_channels)],
+             Operator.O_F: [f'out{i+1}' for i in range(max_channels)]}
+    instance = Instance(ports, stateful=ImplementationState.STATELESS)
+
+    while instance.reuse_instance():
+        for p_in, p_out in zip(ports[Operator.F_INIT], ports[Operator.O_F]):
+            if instance.is_connected(p_in):
+                instance.send(p_out, instance.receive(p_in))
+
+
+def main_component():
+    instance = Instance({
+            Operator.O_I: ['state_out'],
+            Operator.S: ['Ai', 'Bi', 'Ci', 'Di'],
+            Operator.O_F: ['o_f']})
+
+    while instance.reuse_instance():
+        dt = instance.get_setting('dt', 'float')
+        t_max = instance.get_setting('t_max', 'float')
+
+        if instance.resuming():
+            msg = instance.load_snapshot()
+            t_cur = msg.timestamp
+            i, t_remaining = msg.data
+            monotonic_end = time.monotonic() + t_remaining
+
+        if instance.should_init():
+            t_cur = 0
+            monotonic_end = time.monotonic() + t_max
+            i = 0
+
+        while time.monotonic() < monotonic_end:
+            instance.send('state_out', Message(t_cur, None, i))
+            for port in ('Ai', 'Bi', 'Ci', 'Di'):
+                instance.receive(port)
+
+            t_cur += dt
+            i += 1
+            time.sleep(0.05)
+
+            if instance.should_save_snapshot(t_cur):
+                instance.save_snapshot(Message(
+                        t_cur, None, [i, monotonic_end - time.monotonic()]))
+
+        instance.send('o_f', Message(t_cur, None, i))
+
+        if instance.should_save_final_snapshot():
+            instance.save_final_snapshot(Message(t_cur, None, [i, 0]))
+
+
+@pytest.fixture
+def config():
+    return load(f"""ymmsl_version: v0.1
+model:
+  name: test_snapshot
+  components:
+    main: main_component
+    cacheA: cache_component
+    cacheB: cache_component
+    cacheC: cache_component
+    calcA: echo_component
+    calcB: echo_component
+    calcC: echo_component
+    calcD: echo_component
+  conduits:
+    main.state_out:
+    - cacheA.in1
+    - cacheB.in1
+    - cacheC.in1
+    - calcD.in1
+
+    cacheA.out1: main.Ai
+    cacheA.out2: main.Bi
+    cacheA.sub_out1: calcA.in1
+    cacheA.sub_out2: calcA.in2
+    calcA.out1: cacheA.sub_in1
+    calcA.out2: cacheA.sub_in2
+
+    cacheB.out1:
+    - cacheA.in2
+    - cacheC.in2
+    cacheB.sub_out1: calcB.in1
+    calcB.out1: cacheB.sub_in1
+
+    cacheC.out1: main.Ci
+    cacheC.sub_out1: calcC.in1
+    cacheC.sub_out2: calcC.in2
+    calcC.out1: cacheC.sub_in1
+
+    calcD.out1: main.Di
+
+settings:
+  dt: 1.234
+  t_max: 2.0  # seconds
+  cacheA.cache_valid: [2.0, 5.0]
+  cacheB.cache_valid: [3.0, 8.0]
+  cacheC.cache_valid: [4.0, 10.0]
+  muscle_remote_log_level: {_LOG_LEVEL}
+checkpoints:
+  at_end: true
+  wallclock_time:
+  - every: 0.5""")
+
+
+def test_snapshot_complex_coupling(tmp_path, config):
+    actors = {'main': main_component}
+    for c in 'ABC':
+        actors['cache' + c] = cache_component
+    for c in 'ABCD':
+        actors['calc' + c] = echo_component
+
+    run_dir1 = RunDir(tmp_path / 'run1')
+    run_manager_with_actors(
+            dump(config), run_dir1.path, python_actors=actors)
+
+    assert len(ls_snapshots(run_dir1, 'main')) == 5  # 2.0/0.5, at_end
+    assert len(ls_snapshots(run_dir1, 'cacheA')) == 5  # 2.0/0.5, at_end
+    assert len(ls_snapshots(run_dir1, 'cacheB')) == 5  # 2.0/0.5, at_end
+    assert len(ls_snapshots(run_dir1, 'cacheC')) == 5  # 2.0/0.5, at_end
+    # Due to caches, calcA/B/C may not run every 0.5 seconds
+    assert 1 <= len(ls_snapshots(run_dir1, 'calcA')) <= 5
+    assert 1 <= len(ls_snapshots(run_dir1, 'calcB')) <= 5
+    assert 1 <= len(ls_snapshots(run_dir1, 'calcC')) <= 5
+    assert len(ls_snapshots(run_dir1, 'calcD')) == 5  # 2.0/0.5, at_end
+
+    snapshots_ymmsl = ls_snapshots(run_dir1)
+    snapshot_docs = list(map(load, snapshots_ymmsl))
+    # Snapshots based on wallclock time are less reliable. There is at least one
+    # resume yMMSL: the at_end collection. At most 4 more, one for each
+    # wallclock_time checkpoint.
+    assert 1 <= len(snapshot_docs) <= 5

From adb3b3ab12b0fdf5f1014e1aa6a59fb7410d84b0 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Wed, 30 Nov 2022 13:34:30 +0100
Subject: [PATCH 105/183] Remove outdated comments

---
 libmuscle/python/libmuscle/communicator.py              | 3 ---
 libmuscle/python/libmuscle/manager/snapshot_registry.py | 2 +-
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/libmuscle/python/libmuscle/communicator.py b/libmuscle/python/libmuscle/communicator.py
index 5c68ca4d..bf1cf33e 100644
--- a/libmuscle/python/libmuscle/communicator.py
+++ b/libmuscle/python/libmuscle/communicator.py
@@ -324,8 +324,6 @@ def receive_message(self, port_name: str, slot: Optional[int] = None,
         profile_event.message_size = len(mcp_message_bytes)
 
         expected_message_number = port.get_num_messages(slot)
-        # TODO: handle f_init port counts for STATELESS and WEAKLY_STATEFUL
-        # components which didn't load a snapshot
         if expected_message_number != mcp_message.message_number:
             if (expected_message_number - 1 == mcp_message.message_number and
                     port.is_resuming(slot)):
@@ -388,7 +386,6 @@ def restore_message_counts(self, port_message_counts: Dict[str, List[int]]
                 raise RuntimeError(f'Unknown port {port_name} in snapshot.'
                                    ' Have your port definitions changed since'
                                    ' the snapshot was taken?')
-        # TODO decide if we should check whether all ports are covered
 
     def get_message_counts(self) -> Dict[str, List[int]]:
         """Get message counts for all ports on the communicator
diff --git a/libmuscle/python/libmuscle/manager/snapshot_registry.py b/libmuscle/python/libmuscle/manager/snapshot_registry.py
index dcf7c3e8..3883ea9a 100644
--- a/libmuscle/python/libmuscle/manager/snapshot_registry.py
+++ b/libmuscle/python/libmuscle/manager/snapshot_registry.py
@@ -161,7 +161,7 @@ def do_consistency_check(
             else:
                 consistent = calc_consistency_list(
                         i_msg_counts, p_msg_counts, is_sending, peer_is_restart)
-            if not consistent:  # not consistent
+            if not consistent:
                 return False
         self.consistent_peers.setdefault(
                 peer_node.instance, []).append(peer_node)

From 876e6267b354320707b3e33dd85487c660d37435 Mon Sep 17 00:00:00 2001
From: Lourens Veen <l.veen@esciencecenter.nl>
Date: Thu, 1 Dec 2022 09:43:52 +0100
Subject: [PATCH 106/183] Remove Python 3.6 support

---
 .github/workflows/ci.yaml                     |  4 ++--
 .../workflows/ci_python_compatibility.yaml    |  7 +------
 .github/workflows/ci_ubuntu18.04.yaml         | 19 -------------------
 .github/workflows/ci_ubuntu18.04_clang.yaml   | 19 -------------------
 docs/source/examples/python/requirements.txt  |  1 -
 docs/source/installing.rst.in                 |  2 +-
 setup.py                                      |  4 +---
 tox.ini                                       |  3 +--
 8 files changed, 6 insertions(+), 53 deletions(-)
 delete mode 100644 .github/workflows/ci_ubuntu18.04.yaml
 delete mode 100644 .github/workflows/ci_ubuntu18.04_clang.yaml

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 351ede6c..16b81e96 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -8,10 +8,10 @@ jobs:
     steps:
     - uses: actions/checkout@v2
 
-    - name: Set up Python 3.6
+    - name: Set up Python 3.8
       uses: actions/setup-python@v1
       with:
-        python-version: 3.6
+        python-version: 3.8
 
     - name: Install dependencies
       run: |
diff --git a/.github/workflows/ci_python_compatibility.yaml b/.github/workflows/ci_python_compatibility.yaml
index bdc2e86b..89bc9126 100644
--- a/.github/workflows/ci_python_compatibility.yaml
+++ b/.github/workflows/ci_python_compatibility.yaml
@@ -6,7 +6,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.6", "3.7", "3.8", "3.9", "3.10"]
+        python-version: ["3.7", "3.8", "3.9", "3.10"]
 
     steps:
     - name: Check out the source code
@@ -30,8 +30,3 @@ jobs:
 
     - name: Run the test suite
       run: make test_python_only
-
-    - name: Upload coverage report to Codacy
-      uses: codacy/codacy-coverage-reporter-action@master
-      with:
-        project-token: ${{ secrets.CODACY_PROJECT_TOKEN }}
diff --git a/.github/workflows/ci_ubuntu18.04.yaml b/.github/workflows/ci_ubuntu18.04.yaml
deleted file mode 100644
index 9e0448ac..00000000
--- a/.github/workflows/ci_ubuntu18.04.yaml
+++ /dev/null
@@ -1,19 +0,0 @@
-# Run Continuous Integration for the latest Ubuntu release
-# This mainly checks for issues/regressions in the native build
-name: native_compatibility_ubuntu18.04
-on:
-  schedule:
-    - cron: '0 2 * * 0'
-  push:
-    branches:
-      - 'release-*'
-      - fix_native_compatibility_ci
-jobs:
-  build:
-    runs-on: ubuntu-latest
-
-    steps:
-    - uses: actions/checkout@v2
-
-    - name: Run tests on Ubuntu 18.04
-      run: docker run -v "${GITHUB_WORKSPACE}:/workspace" --env LC_ALL=C.UTF-8 --env LANG=C.UTF-8 --env DEBIAN_FRONTEND=noninteractive ubuntu:18.04 /bin/bash -c 'apt-get update && apt-get -y dist-upgrade && apt-get -y install build-essential cmake gfortran git valgrind libopenmpi-dev pkg-config python3 python3-pip python3-venv curl && apt-get -y remove libssl-dev zlib1g-dev && useradd -m -d /home/muscle3 muscle3 && echo "Added user" && su muscle3 -c -- "cp -r --preserve=mode /workspace /home/muscle3/muscle3" && echo "Copied files" && su muscle3 -c -- "pip3 install --user -U \"pip<22\" setuptools wheel" && su muscle3 -c -- "pip3 install --user \"ymmsl>=0.12.0,<0.13\" qcg-pilotjob==0.13.1" && su muscle3 -c -- "cd /home/muscle3/muscle3 && make test_examples"'
diff --git a/.github/workflows/ci_ubuntu18.04_clang.yaml b/.github/workflows/ci_ubuntu18.04_clang.yaml
deleted file mode 100644
index 49864bc2..00000000
--- a/.github/workflows/ci_ubuntu18.04_clang.yaml
+++ /dev/null
@@ -1,19 +0,0 @@
-# Run Continuous Integration for the latest Ubuntu release
-# This mainly checks for issues/regressions in the native build
-name: native_compatibility_ubuntu18.04_clang
-on:
-  schedule:
-    - cron: '30 2 * * 0'
-  push:
-    branches:
-      - 'release-*'
-      - fix_native_compatibility_ci
-jobs:
-  build:
-    runs-on: ubuntu-latest
-
-    steps:
-    - uses: actions/checkout@v2
-
-    - name: Run tests on Ubuntu 18.04 with Clang
-      run: docker run -v "${GITHUB_WORKSPACE}:/workspace" --env LC_ALL=C.UTF-8 --env LANG=C.UTF-8 --env DEBIAN_FRONTEND=noninteractive ubuntu:18.04 /bin/bash -c 'apt-get update && apt-get -y dist-upgrade && apt-get -y install build-essential clang cmake gfortran git valgrind libopenmpi-dev pkg-config python3 python3-pip python3-venv curl && apt-get -y remove libssl-dev zlib1g-dev && useradd -m -d /home/muscle3 muscle3 && echo "Added user" && su muscle3 -c -- "cp -r --preserve=mode /workspace /home/muscle3/muscle3" && echo "Copied files" && su muscle3 -c -- "pip3 install --user -U \"pip<22\" setuptools wheel" && su muscle3 -c -- "pip3 install --user \"ymmsl>=0.12.0,<0.13\" qcg-pilotjob==0.13.1" && su muscle3 -c -- "cd /home/muscle3/muscle3 && CXXFLAGS=-fPIE OMPI_CXX=clang++ CXX=clang++ make test_examples"'
diff --git a/docs/source/examples/python/requirements.txt b/docs/source/examples/python/requirements.txt
index 8e2ef00e..fa14df52 100644
--- a/docs/source/examples/python/requirements.txt
+++ b/docs/source/examples/python/requirements.txt
@@ -1,5 +1,4 @@
 matplotlib>=3,<4
-numpy==1.19.5; python_version=='3.6'
 numpy<1.22; python_version=='3.7'
 numpy>=1.22,<=1.25; python_version>='3.8'
 sobol_seq==0.2.0
diff --git a/docs/source/installing.rst.in b/docs/source/installing.rst.in
index cbafedad..25620d03 100644
--- a/docs/source/installing.rst.in
+++ b/docs/source/installing.rst.in
@@ -12,7 +12,7 @@ Python
 
 Installing MUSCLE3 on Python will install all the Python-based components of
 the system, i.e. the Python version of libmuscle, the YMMSL Python library, and
-the MUSCLE Manager. This requires at least Python 3.6.
+the MUSCLE Manager. This requires at least Python 3.7.
 
 MUSCLE3 is on PyPI as an ordinary Python package, so it can be installed via
 Pip in the usual way. It's normally a good idea to make a virtual environment
diff --git a/setup.py b/setup.py
index b99e2d06..467e2595 100644
--- a/setup.py
+++ b/setup.py
@@ -29,7 +29,6 @@
         'Development Status :: 4 - Beta',
         'License :: OSI Approved :: Apache Software License',
         'Operating System :: POSIX :: Linux',
-        'Programming Language :: Python :: 3.6',
         'Programming Language :: Python :: 3.7',
         'Programming Language :: Python :: 3.8',
         'Programming Language :: Python :: 3.9',
@@ -45,12 +44,11 @@
             'muscle_manager=muscle3.muscle_manager:manage_simulation',
             'muscle3=muscle3.muscle3:muscle3']
     },
-    python_requires='>=3.6, <4',
+    python_requires='>=3.7, <4',
     install_requires=[
         'click>=7.1,<9',
         'msgpack>=1,<2',
         'netifaces==0.11.0',
-        "numpy==1.19.5; python_version=='3.6'",
         "numpy<1.22; python_version=='3.7'",
         "numpy>=1.22,<=1.25; python_version>='3.8'",
         'qcg-pilotjob==0.13.1',
diff --git a/tox.ini b/tox.ini
index 9c6c3968..d556d6ee 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,5 +1,5 @@
 [tox]
-envlist = py36, py37, py38, py39, py310
+envlist = py37, py38, py39, py310
 skip_missing_interpreters = true
 
 [testenv]
@@ -22,7 +22,6 @@ commands =
 
 [gh-actions]
 python =
-    3.6: py36
     3.7: py37
     3.8: py38
     3.9: py39

From 96e64737914bb428eeec9a81e1af8e01336ff965 Mon Sep 17 00:00:00 2001
From: Lourens Veen <l.veen@esciencecenter.nl>
Date: Thu, 1 Dec 2022 10:00:51 +0100
Subject: [PATCH 107/183] Limit flake8 to <6 for now (#137)

---
 tox.ini | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tox.ini b/tox.ini
index d556d6ee..6f71615f 100644
--- a/tox.ini
+++ b/tox.ini
@@ -5,7 +5,7 @@ skip_missing_interpreters = true
 [testenv]
 deps =
     mypy
-    flake8
+    flake8<6
     pytest
     pytest-cov
     git+https://github.com/multiscale/ymmsl-python.git@feature/multicast#egg=ymmsl

From 3a55125d884f4446bb1fe053c0594e50a6c3cf94 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Thu, 1 Dec 2022 13:13:11 +0100
Subject: [PATCH 108/183] Remove python 3.6 support

---
 setup.py | 1 -
 tox.ini  | 1 -
 2 files changed, 2 deletions(-)

diff --git a/setup.py b/setup.py
index 9b564beb..467e2595 100644
--- a/setup.py
+++ b/setup.py
@@ -53,7 +53,6 @@
         "numpy>=1.22,<=1.25; python_version>='3.8'",
         'qcg-pilotjob==0.13.1',
         'typing_extensions<4',
-        "dataclasses; python_version=='3.6'",
         'ymmsl>=0.12.0,<0.13'          # Also in CI, update there as well
     ],
     extras_require={
diff --git a/tox.ini b/tox.ini
index 54b84ba9..006e8901 100644
--- a/tox.ini
+++ b/tox.ini
@@ -9,7 +9,6 @@ deps =
     pytest
     pytest-cov
     git+https://github.com/multiscale/ymmsl-python.git@feature/checkpointing#egg=ymmsl
-    types-dataclasses; python_version=='3.6'
 
 passenv =
     MUSCLE_TEST_PYTHON_ONLY

From 48f72b51e7c067f10b0a0bd86f23803018763053 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Thu, 1 Dec 2022 15:16:03 +0100
Subject: [PATCH 109/183] Add links to the github page in contributing.rst

---
 docs/source/contributing.rst | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/docs/source/contributing.rst b/docs/source/contributing.rst
index c3132e5e..f10fe77b 100644
--- a/docs/source/contributing.rst
+++ b/docs/source/contributing.rst
@@ -22,7 +22,7 @@ proceed as below.
 Make an Issue
 =============
 
-Issues are found in a tab at the top of the repository home page. Please check
+`Issues`_ are found in a tab at the top of `the repository home page`_. Please check
 to see that the bug you want to fix or the feature you want to add does not
 already have an issue dedicated to it. If it does, feel free to add to the
 discussion. If not, please make a new issue.
@@ -48,6 +48,9 @@ describe
 If you want to fix the bug or implement the feature yourself, you'll have to set
 up a development environment.
 
+.. _Issues: https://github.com/multiscale/muscle3/issues
+.. _the repository home page: https://github.com/multiscale/muscle3/
+
 
 Get a local repository
 ======================

From 52149ca8976f60f356055713889145f7efaecdb6 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Fri, 2 Dec 2022 10:45:49 +0100
Subject: [PATCH 110/183] Use sphinx-click to generate cmdline tool docs

---
 docs/requirements.txt              |  1 +
 docs/source/command_line_tools.rst | 12 ++++++++++++
 docs/source/conf.py                |  3 ++-
 docs/source/index.rst              |  1 +
 tox.ini                            |  1 +
 5 files changed, 17 insertions(+), 1 deletion(-)
 create mode 100644 docs/source/command_line_tools.rst

diff --git a/docs/requirements.txt b/docs/requirements.txt
index 98b8214f..5d61eae3 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -7,5 +7,6 @@ qcg-pilotjob
 six
 sphinx-fortran
 sphinx-tabs
+sphinx-click
 typing==3.6.6
 ymmsl
diff --git a/docs/source/command_line_tools.rst b/docs/source/command_line_tools.rst
new file mode 100644
index 00000000..e98e7552
--- /dev/null
+++ b/docs/source/command_line_tools.rst
@@ -0,0 +1,12 @@
+Command line tools
+==================
+
+.. click:: muscle3.muscle_manager:manage_simulation
+    :prog: muscle_manager
+    :nested: full
+
+
+.. click:: muscle3.muscle3:muscle3
+    :prog: muscle3
+    :nested: full
+
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 7464d1c0..56761f46 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -51,7 +51,8 @@
         'sphinx.ext.todo',
         'sphinx.ext.viewcode',
         'sphinxfortran.fortran_domain',
-        'sphinx_tabs.tabs']
+        'sphinx_tabs.tabs',
+        'sphinx_click']
 
 # Add any paths that contain templates here, relative to this directory.
 # templates_path = ['_templates']
diff --git a/docs/source/index.rst b/docs/source/index.rst
index ed55ba48..9b1a093b 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -41,6 +41,7 @@ Cham.  `<https://doi.org/10.1007/978-3-030-50433-5_33>`_
    python_api
    cpp_api
    fortran_api
+   command_line_tools
 
    contributing
    devtools
diff --git a/tox.ini b/tox.ini
index 6f71615f..045f7e5b 100644
--- a/tox.ini
+++ b/tox.ini
@@ -41,5 +41,6 @@ deps =
     sphinx-fortran
     sphinx-tabs
     sphinx_rtd_theme
+    sphinx-click
 commands = sphinx-build docs/source docs/build -bhtml
 

From bd1386c1d34945968f218cabe2f347d8178e421a Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Mon, 5 Dec 2022 14:13:02 +0100
Subject: [PATCH 111/183] Fix should_save_final_snapshot when not reusing

ClosePort messages have `inf` timestamps, so would always trigger a
final snapshot. Only expected when `at_end` checkpoints should be taken.
---
 libmuscle/python/libmuscle/checkpoint_triggers.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/libmuscle/python/libmuscle/checkpoint_triggers.py b/libmuscle/python/libmuscle/checkpoint_triggers.py
index a4edf3be..4299977a 100644
--- a/libmuscle/python/libmuscle/checkpoint_triggers.py
+++ b/libmuscle/python/libmuscle/checkpoint_triggers.py
@@ -256,9 +256,10 @@ def should_save_final_snapshot(
         self.__check_should_have_saved()
 
         value = False
-        if not do_reuse and self._checkpoint_at_end:
-            value = True
-            self._last_triggers.append('at_end')
+        if not do_reuse:
+            if self._checkpoint_at_end:
+                value = True
+                self._last_triggers.append('at_end')
         elif f_init_max_timestamp is None:
             # No F_INIT messages received: reuse triggered on muscle_settings_in
             # message.

From 91d2aaad82014d38d1a385d516dc1f300f062d16 Mon Sep 17 00:00:00 2001
From: Lourens Veen <l.veen@esciencecenter.nl>
Date: Mon, 2 Jan 2023 15:03:30 +0100
Subject: [PATCH 112/183] Fix leftover value from previous protocol design

---
 libmuscle/python/libmuscle/test/test_mmp_client.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/libmuscle/python/libmuscle/test/test_mmp_client.py b/libmuscle/python/libmuscle/test/test_mmp_client.py
index a47311a6..d5051962 100644
--- a/libmuscle/python/libmuscle/test/test_mmp_client.py
+++ b/libmuscle/python/libmuscle/test/test_mmp_client.py
@@ -1,4 +1,3 @@
-from datetime import datetime, timezone
 from unittest.mock import patch
 
 import msgpack
@@ -74,10 +73,7 @@ def test_get_settings(mocked_mmp_client) -> None:
 def test_register_instance(mocked_mmp_client) -> None:
     client, stub = mocked_mmp_client
 
-    result = [ResponseType.SUCCESS.value,
-              (datetime.now(timezone.utc).timestamp(),
-               {'wallclock_time': [], 'simulation_time': []},
-               None)]
+    result = [ResponseType.SUCCESS.value]
     stub.call.return_value = msgpack.packb(result, use_bin_type=True)
 
     client.register_instance(

From b58567b1875a0b10baac88866822081747e1a490 Mon Sep 17 00:00:00 2001
From: Lourens Veen <l.veen@esciencecenter.nl>
Date: Mon, 2 Jan 2023 15:06:05 +0100
Subject: [PATCH 113/183] Fix a few typos

---
 libmuscle/cpp/src/libmuscle/communicator.cpp     | 2 +-
 libmuscle/python/libmuscle/instance.py           | 2 +-
 libmuscle/python/libmuscle/manager/mmp_server.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/libmuscle/cpp/src/libmuscle/communicator.cpp b/libmuscle/cpp/src/libmuscle/communicator.cpp
index 25526e4b..269a1139 100644
--- a/libmuscle/cpp/src/libmuscle/communicator.cpp
+++ b/libmuscle/cpp/src/libmuscle/communicator.cpp
@@ -219,7 +219,7 @@ Message Communicator::receive_message(
             if (slot.is_set())
                 logger_.debug("Discarding received message on ", port_name,
                               "[", slot.get(), "]: resuming from weakly",
-                              " constistent snapshot");
+                              " consistent snapshot");
             else
                 logger_.debug("Discarding received message on ", port_name,
                               ": resuming from weakly constistent snapshot");
diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py
index a0984fa2..245775e8 100644
--- a/libmuscle/python/libmuscle/instance.py
+++ b/libmuscle/python/libmuscle/instance.py
@@ -661,7 +661,7 @@ def __check_reuse_instance(self, apply_overlay: bool) -> bool:
         """Pre-receive F_INIT messages and detect if this instance is reused.
 
         This is called during :meth:`should_save_final_snapshot` to detect if a
-        snapshot must be taken. If an instance does implement checkpointing,
+        snapshot must be taken. If an instance doesn't implement checkpointing,
         :meth:`reuse_instance` will call it instead.
         """
         do_reuse = self.__receive_settings()
diff --git a/libmuscle/python/libmuscle/manager/mmp_server.py b/libmuscle/python/libmuscle/manager/mmp_server.py
index 90617fae..f689e6f5 100644
--- a/libmuscle/python/libmuscle/manager/mmp_server.py
+++ b/libmuscle/python/libmuscle/manager/mmp_server.py
@@ -288,7 +288,7 @@ def _get_checkpoint_info(self, instance_id: str) -> Any:
             status (ResponseType): SUCCESS
             wallclock_reference_time (float): Unix timestamp (in UTC) indicating
                 wallclock time of the start of the workflow.
-            checkpoints (dict): Dictionary encdoing a ymmsl.Checkpoints object.
+            checkpoints (dict): Dictionary encoding a ymmsl.Checkpoints object.
             resume_path (Optional[str]): Checkpoint filename to resume from.
             snapshot_directory (Optional[str]): Directory to store instance
                 snapshots.

From 32b24930c8e52cd8aeccfcece8d7102ffac59705 Mon Sep 17 00:00:00 2001
From: Lourens Veen <l.veen@esciencecenter.nl>
Date: Mon, 2 Jan 2023 15:06:35 +0100
Subject: [PATCH 114/183] Improve comments to not duplicate typo annotations

---
 libmuscle/python/libmuscle/mmp_client.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/libmuscle/python/libmuscle/mmp_client.py b/libmuscle/python/libmuscle/mmp_client.py
index 188814ff..14e83e9a 100644
--- a/libmuscle/python/libmuscle/mmp_client.py
+++ b/libmuscle/python/libmuscle/mmp_client.py
@@ -74,15 +74,15 @@ def decode_checkpoint_info(
     Args:
         reference_timestamp: seconds since UNIX epoch in UTC timezone to use as
             wallclock_time = 0
-        checkpoints_dict: dictionary of checkpoint definitions
-        resume: optional string indicating resume path
-        snapshot_dir: optional string indicating path to store snapshots in
+        checkpoints_dict: checkpoint definitions from the MsgPack
+        resume: path to the snapshot we should resume from, if any
+        snapshot_dir: path to the directory to store new snapshots in
 
     Returns:
         wallclock_time_reference: UTC time where wallclock_time = 0
         checkpoints: checkpoint configuration
         resume: path to the resume snapshot
-        snapshot_dir: optional path to store snapshots in
+        snapshot_dir: path to store the snapshots in
     """
     ref_time = datetime.fromtimestamp(reference_timestamp, tz=timezone.utc)
     checkpoints = Checkpoints(

From 0409516574975dce2eac2641e868c9439986954a Mon Sep 17 00:00:00 2001
From: Lourens Veen <l.veen@esciencecenter.nl>
Date: Mon, 2 Jan 2023 15:07:37 +0100
Subject: [PATCH 115/183] Use default message instead of checking for
 connection

---
 integration_test/test_snapshot_complex_coupling.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/integration_test/test_snapshot_complex_coupling.py b/integration_test/test_snapshot_complex_coupling.py
index 84ffc4af..79b112d4 100644
--- a/integration_test/test_snapshot_complex_coupling.py
+++ b/integration_test/test_snapshot_complex_coupling.py
@@ -23,29 +23,28 @@ def cache_component(max_channels=2):
     cache_t = float('-inf')
     cache_data = []
     max_cache_age = None
+    nil_msg = Message(0.0, None, None)
+
     while instance.reuse_instance():
         cache_valid_range = instance.get_setting('cache_valid', '[float]')
         if max_cache_age is None:
             max_cache_age = random.uniform(*cache_valid_range)
 
-        msgs = [instance.receive(port) if instance.is_connected(port) else None
+        msgs = [instance.receive(port, default=nil_msg)
                 for port in ports[Operator.F_INIT]]
         cur_t = msgs[0].timestamp
 
         if cur_t - cache_t >= max_cache_age:
             # Cached value is no longer valid, run submodel for updated data
             for msg, port in zip(msgs, ports[Operator.O_I]):
-                if msg is not None:
-                    instance.send(port, Message(cur_t, None, msg.data))
-            cache_data = [instance.receive(port).data
-                          if instance.is_connected(port) else None
+                instance.send(port, Message(cur_t, None, msg.data))
+            cache_data = [instance.receive(port, default=nil_msg).data
                           for port in ports[Operator.S]]
             cache_t = cur_t
             max_cache_age = random.uniform(*cache_valid_range)
 
         for data, port in zip(cache_data, ports[Operator.O_F]):
-            if data is not None:
-                instance.send(port, Message(cur_t, None, data))
+            instance.send(port, Message(cur_t, None, data))
 
         if instance.should_save_final_snapshot():
             instance.save_final_snapshot(Message(cur_t, None, []))

From bca4d83fb91d5ef1e14acbfa06b0f0bbf7d5e2f7 Mon Sep 17 00:00:00 2001
From: Lourens Veen <l.veen@esciencecenter.nl>
Date: Mon, 2 Jan 2023 15:09:03 +0100
Subject: [PATCH 116/183] Use default values to simplify member functions

---
 libmuscle/cpp/src/libmuscle/port.cpp | 59 ++++----------------------
 libmuscle/cpp/src/libmuscle/port.hpp | 62 ++--------------------------
 2 files changed, 13 insertions(+), 108 deletions(-)

diff --git a/libmuscle/cpp/src/libmuscle/port.cpp b/libmuscle/cpp/src/libmuscle/port.cpp
index 70db0550..1dd32a44 100644
--- a/libmuscle/cpp/src/libmuscle/port.cpp
+++ b/libmuscle/cpp/src/libmuscle/port.cpp
@@ -137,66 +137,25 @@ const std::vector<int> & Port::get_message_counts() const {
     return num_messages_;
 }
 
-void Port::increment_num_messages() {
-    num_messages_[0] ++;
-    set_resumed();
-}
-
-void Port::increment_num_messages(int slot) {
-    num_messages_[slot] ++;
-    set_resumed(slot);
-}
-
 void Port::increment_num_messages(Optional<int> slot) {
-    if(slot.is_set())
-        increment_num_messages(slot.get());
-    else
-        increment_num_messages();
-}
-
-int Port::get_num_messages() const {
-    return num_messages_[0];
-}
-
-int Port::get_num_messages(int slot) const {
-    return num_messages_[slot];
+    int s = slot.is_set() ? slot.get() : 0;
+    num_messages_[s] ++;
+    set_resumed(s);
 }
 
 int Port::get_num_messages(Optional<int> slot) const {
-    if(slot.is_set())
-        return get_num_messages(slot.get());
-    else
-        return get_num_messages();
-}
-
-bool Port::is_resuming() const {
-    return is_resuming_[0];
-}
-
-bool Port::is_resuming(int slot) const {
-    return is_resuming_[slot];
+    int s = slot.is_set() ? slot.get() : 0;
+    return num_messages_[s];
 }
 
 bool Port::is_resuming(Optional<int> slot) const {
-    if(slot.is_set())
-        return is_resuming(slot.get());
-    else
-        return is_resuming();
-}
-
-void Port::set_resumed() {
-    is_resuming_[0] = false;
-}
-
-void Port::set_resumed(int slot) {
-    is_resuming_[slot] = false;
+    int s = slot.is_set() ? slot.get() : 0;
+    return is_resuming_[s];
 }
 
 void Port::set_resumed(Optional<int> slot) {
-    if(slot.is_set())
-        set_resumed(slot.get());
-    else
-        set_resumed();
+    int s = slot.is_set() ? slot.get() : 0;
+    is_resuming_[s] = false;
 }
 
 } }
diff --git a/libmuscle/cpp/src/libmuscle/port.hpp b/libmuscle/cpp/src/libmuscle/port.hpp
index 18cfb5d9..908fb270 100644
--- a/libmuscle/cpp/src/libmuscle/port.hpp
+++ b/libmuscle/cpp/src/libmuscle/port.hpp
@@ -118,59 +118,17 @@ class Port : public ::ymmsl::Port {
          */
         const std::vector<int> & get_message_counts() const;
 
-        /** Increment amount of messages sent or received.
-         */
-        void increment_num_messages();
-
-        /** Increment amount of messages sent or received.
-         *
-         * Only valid for vector ports.
-         *
-         * @param slot The slot that is sent/received on
-         */
-        void increment_num_messages(int slot);
-
         /** Increment amount of messages sent or received.
          *
          * @param slot The slot that is sent/received on
          */
-        void increment_num_messages(Optional<int> slot);
+        void increment_num_messages(Optional<int> slot = {});
 
         /** Get the amount of messages sent or received
-         */
-        int get_num_messages() const;
-
-        /** Get the amount of messages sent or received
-         *
-         * Only valid for vector ports.
-         *
-         * @param slot The slot that is sent/received on
-         */
-        int get_num_messages(int slot) const;
-
-        /** Get the amount of messages sent or received
-         *
-         * @param slot The slot that is sent/received on
-         */
-        int get_num_messages(Optional<int> slot) const;
-
-        /** True when this port has resumed.
-         *
-         * After resumption, each port/slot may discard exactly one message.
-         * is_resuming keeps track of this state.
-         */
-        bool is_resuming() const;
-
-        /** True when this port has resumed.
-         *
-         * After resumption, each port/slot may discard exactly one message.
-         * is_resuming keeps track of this state.
-         *
-         * Only valid for vector ports.
          *
          * @param slot The slot that is sent/received on
          */
-        bool is_resuming(int slot) const;
+        int get_num_messages(Optional<int> slot = {}) const;
 
         /** True when this port has resumed.
          *
@@ -179,25 +137,13 @@ class Port : public ::ymmsl::Port {
          *
          * @param slot The slot that is sent/received on
          */
-        bool is_resuming(Optional<int> slot) const;
-
-        /** Mark that this port has resumed and may no longer discard messages.
-         */
-        void set_resumed();
-
-        /** Mark that this port has resumed and may no longer discard messages.
-         *
-         * Only valid for vector ports.
-         *
-         * @param slot The slot that is sent/received on
-         */
-        void set_resumed(int slot);
+        bool is_resuming(Optional<int> slot = {}) const;
 
         /** Mark that this port has resumed and may no longer discard messages.
          *
          * @param slot The slot that is sent/received on
          */
-        void set_resumed(Optional<int> slot);
+        void set_resumed(Optional<int> slot = {});
 
     private:
         bool is_connected_;

From db6d030589f899fd9748994a8f196eb9779eb175 Mon Sep 17 00:00:00 2001
From: Lourens Veen <l.veen@esciencecenter.nl>
Date: Tue, 3 Jan 2023 15:58:56 +0100
Subject: [PATCH 117/183] Factor out API checking

---
 .../test_snapshot_complex_coupling.py         |  16 +-
 libmuscle/python/libmuscle/api_guard.py       | 221 ++++++++++++++++++
 .../python/libmuscle/checkpoint_triggers.py   |  87 +------
 libmuscle/python/libmuscle/instance.py        |  50 ++--
 .../python/libmuscle/snapshot_manager.py      |  21 +-
 libmuscle/python/libmuscle/test/conftest.py   |   6 +
 .../python/libmuscle/test/test_api_guard.py   | 154 ++++++++++++
 .../test/test_checkpoint_triggers.py          |  46 +---
 .../libmuscle/test/test_snapshot_manager.py   |  12 +-
 9 files changed, 447 insertions(+), 166 deletions(-)
 create mode 100644 libmuscle/python/libmuscle/api_guard.py
 create mode 100644 libmuscle/python/libmuscle/test/test_api_guard.py

diff --git a/integration_test/test_snapshot_complex_coupling.py b/integration_test/test_snapshot_complex_coupling.py
index 79b112d4..a75a89ce 100644
--- a/integration_test/test_snapshot_complex_coupling.py
+++ b/integration_test/test_snapshot_complex_coupling.py
@@ -26,13 +26,17 @@ def cache_component(max_channels=2):
     nil_msg = Message(0.0, None, None)
 
     while instance.reuse_instance():
-        cache_valid_range = instance.get_setting('cache_valid', '[float]')
-        if max_cache_age is None:
-            max_cache_age = random.uniform(*cache_valid_range)
+        if instance.resuming():
+            instance.load_snapshot()
+
+        if instance.should_init():
+            cache_valid_range = instance.get_setting('cache_valid', '[float]')
+            if max_cache_age is None:
+                max_cache_age = random.uniform(*cache_valid_range)
 
-        msgs = [instance.receive(port, default=nil_msg)
-                for port in ports[Operator.F_INIT]]
-        cur_t = msgs[0].timestamp
+            msgs = [instance.receive(port, default=nil_msg)
+                    for port in ports[Operator.F_INIT]]
+            cur_t = msgs[0].timestamp
 
         if cur_t - cache_t >= max_cache_age:
             # Cached value is no longer valid, run submodel for updated data
diff --git a/libmuscle/python/libmuscle/api_guard.py b/libmuscle/python/libmuscle/api_guard.py
new file mode 100644
index 00000000..1f4fbfd3
--- /dev/null
+++ b/libmuscle/python/libmuscle/api_guard.py
@@ -0,0 +1,221 @@
+from enum import auto, Enum
+from typing import Optional
+
+
+class APIPhase(Enum):
+    """Different phases that the user code traverses.
+
+    These values describe different regions that the model code can be
+    in for the case where checkpointing is implemented. By tracking
+    the phase that the model should be in, we can detect incorrect API
+    usage.
+
+    This does not match the yMMSL operators, as it is more
+    fine-grained and concerns checkpointing, which is not represented
+    in the SEL.
+
+    Note that AFTER_REUSE_INSTANCE and BEFORE_RESUMING refer to the
+    same place in the code. AFTER_REUSE_INSTANCE is used when we
+    don't know yet if the code has checkpointing support, and so we
+    don't know whether the next call is to resuming() or to
+    reuse_instance(). Once a checkpointing function has been called,
+    we know that we should expect resume() after reuse_instance() and
+    we use BEFORE_RESUMING accordingly.
+    """
+    BEFORE_REUSE_INSTANCE = auto()
+    """Before calling reuse_instance"""
+
+    AFTER_REUSE_INSTANCE = auto()
+    """At the top of the reuse loop"""
+
+    BEFORE_RESUMING = auto()
+    """Between reuse_instance and resuming"""
+
+    BEFORE_LOAD_SNAPSHOT = auto()
+    """Between resuming and load_snapshot"""
+
+    BEFORE_SHOULD_INIT = auto()
+    """After resuming, before should_init"""
+
+    BEFORE_SHOULD_SAVE_SNAPSHOT = auto()
+    """Between should_init and should_save*"""
+
+    BEFORE_SAVE_SNAPSHOT = auto()
+    """Between should_save_snapshot and save_snapshot"""
+
+    BEFORE_SAVE_FINAL_SNAPSHOT = auto()
+    """Between should_save_final_snapshot and save_final_snapshot"""
+
+    AFTER_REUSE_LOOP = auto()
+    """After the final call to reuse_instance()"""
+
+
+class APIGuard:
+    """Keeps track of and checks in which phase the model is.
+
+    The verify_* functions are called when the corresponding function
+    on Instance is called, to check that we're in the right phase. They
+    raise a RuntimeError if there's a problem. The *_done functions are
+    called to signal that the corresponding function finished
+    successfully, and that we are moving on to the next phase.
+    """
+    def __init__(self) -> None:
+        """Create an APIPhaseTracker.
+
+        This starts the tracker in BEFORE_REUSE_INSTANCE.
+        """
+        self._phase = APIPhase.BEFORE_REUSE_INSTANCE
+        self._uses_checkpointing = None     # type: Optional[bool]
+
+    def uses_checkpointing(self) -> bool:
+        """Return whether the code is using checkpointing.
+
+        We can only determine that the code doesn't use checkpointing
+        if there are no checkpointing calls between the first and
+        second calls to reuse_instance. So this function should only
+        be called after the second call to verify_reuse_instance, or
+        it may raise if the code does not use checkpointing.
+
+        Raises:
+            RuntimeError: if we are at a point where we cannot know
+                the answer yet.
+        """
+        if self._uses_checkpointing is not None:
+            return self._uses_checkpointing
+        raise RuntimeError(
+                'The API was implemented incorrectly, please consult the'
+                ' documentation.')
+
+    def verify_reuse_instance(self) -> None:
+        """Check reuse_instance()"""
+        if self._phase == APIPhase.AFTER_REUSE_INSTANCE:
+            self._uses_checkpointing = False
+        elif self._phase != APIPhase.BEFORE_REUSE_INSTANCE:
+            raise RuntimeError()
+
+    def reuse_instance_done(self, reusing: bool) -> None:
+        """Update phase on successful reuse_instance().
+
+        Args:
+            reusing: Whether we are reusing or not.
+        """
+        if not reusing:
+            self._phase = APIPhase.AFTER_REUSE_LOOP
+        else:
+            if self._uses_checkpointing is None:
+                self._phase = APIPhase.AFTER_REUSE_INSTANCE
+            elif self._uses_checkpointing:
+                self._phase = APIPhase.BEFORE_RESUMING
+            else:
+                self._phase = APIPhase.BEFORE_REUSE_INSTANCE
+
+    def verify_resuming(self) -> None:
+        """Check resuming()"""
+        if self._phase not in (
+                APIPhase.BEFORE_RESUMING, APIPhase.AFTER_REUSE_INSTANCE):
+            raise RuntimeError(
+                    'Please call resuming() only as the first thing in the'
+                    ' reuse loop.')
+
+    def resuming_done(self, resuming: bool) -> None:
+        """Update phase on successful resuming().
+
+        Args:
+            resuming: Whether we're resuming or not.
+        """
+        self._uses_checkpointing = True
+        if resuming:
+            self._phase = APIPhase.BEFORE_LOAD_SNAPSHOT
+        else:
+            self._phase = APIPhase.BEFORE_SHOULD_INIT
+
+    def verify_load_snapshot(self) -> None:
+        """Check load_snapshot()"""
+        if self._phase != APIPhase.BEFORE_LOAD_SNAPSHOT:
+            raise RuntimeError(
+                    'Please check that we are resuming by calling resuming()'
+                    ' before calling load_snapshot()')
+
+    def load_snapshot_done(self) -> None:
+        """Update phase on successful load_snapshot()"""
+        self._phase = APIPhase.BEFORE_SHOULD_INIT
+
+    def verify_should_init(self) -> None:
+        """Check should_init()"""
+        if self._phase != APIPhase.BEFORE_SHOULD_INIT:
+            raise RuntimeError(
+                    'Please check whether to run f_init using should_init()'
+                    ' after resuming, and before trying to save a snapshot.')
+
+    def should_init_done(self) -> None:
+        """Update phase on successful should_init()"""
+        self._phase = APIPhase.BEFORE_SHOULD_SAVE_SNAPSHOT
+
+    def verify_should_save_snapshot(self) -> None:
+        """Check should_save_snapshot()"""
+        if self._phase != APIPhase.BEFORE_SHOULD_SAVE_SNAPSHOT:
+            raise RuntimeError(
+                    'We reached the end of the reuse loop without checking'
+                    ' if a snapshot should be saved. Please add at least'
+                    ' a should_save_final_snapshot and save_final_snapshot.')
+
+    def should_save_snapshot_done(self, should_save: bool) -> None:
+        """Update phase on successful should_save_snapshot().
+
+        Args:
+            should_save: Whether we should save or not.
+        """
+        if should_save:
+            self._phase = APIPhase.BEFORE_SAVE_SNAPSHOT
+
+    def verify_save_snapshot(self) -> None:
+        """Check should_save_snapshot()"""
+        if self._phase != APIPhase.BEFORE_SAVE_SNAPSHOT:
+            raise RuntimeError()
+
+    def save_snapshot_done(self) -> None:
+        """Update phase on successful save_snapshot()"""
+        self._phase = APIPhase.BEFORE_SHOULD_SAVE_SNAPSHOT
+
+    def verify_should_save_final_snapshot(self) -> None:
+        """Check should_save_final_snapshot()."""
+        if self._phase != APIPhase.BEFORE_SHOULD_SAVE_SNAPSHOT:
+            if self._phase in (
+                    APIPhase.BEFORE_REUSE_INSTANCE, APIPhase.AFTER_REUSE_LOOP):
+                msg = (
+                        'Please only call should_save_final_snapshot inside'
+                        ' the reuse loop.')
+            elif self._phase == APIPhase.BEFORE_SAVE_FINAL_SNAPSHOT:
+                msg = (
+                        'If should_save_final_snapshot returns True, then you'
+                        ' must call save_final_snapshot immediately.')
+            elif self._phase == APIPhase.BEFORE_SAVE_SNAPSHOT:
+                msg = (
+                        'If should_save_snapshot returns True, then you must'
+                        ' call save_snapshot first.')
+            else:
+                msg = (
+                        'Please only call should_save_final_snapshot at the'
+                        ' end of the reuse loop.')
+
+            raise RuntimeError(msg)
+
+    def should_save_final_snapshot_done(self, should_save: bool) -> None:
+        """Update phase on successful should_save_snapshot().
+
+        Args:
+            should_save: Whether we should save or not.
+        """
+        if should_save:
+            self._phase = APIPhase.BEFORE_SAVE_FINAL_SNAPSHOT
+        else:
+            self._phase = APIPhase.BEFORE_REUSE_INSTANCE
+
+    def verify_save_final_snapshot(self) -> None:
+        """Check should_save_final_snapshot()"""
+        if self._phase != APIPhase.BEFORE_SAVE_FINAL_SNAPSHOT:
+            raise RuntimeError()
+
+    def save_final_snapshot_done(self) -> None:
+        """Updates state on successful save_final_snapshot()"""
+        self._phase = APIPhase.BEFORE_REUSE_INSTANCE
diff --git a/libmuscle/python/libmuscle/checkpoint_triggers.py b/libmuscle/python/libmuscle/checkpoint_triggers.py
index a4edf3be..88a561f8 100644
--- a/libmuscle/python/libmuscle/checkpoint_triggers.py
+++ b/libmuscle/python/libmuscle/checkpoint_triggers.py
@@ -1,7 +1,6 @@
 import bisect
 from datetime import datetime, timezone
 import logging
-import os
 import time
 from typing import List, Optional, Union
 
@@ -12,13 +11,6 @@
 _logger = logging.getLogger(__name__)
 
 
-def _checkpoint_error(description: str) -> None:
-    if "MUSCLE_DISABLE_CHECKPOINT_ERRORS" in os.environ:
-        _logger.warning(f"Suppressed checkpoint error: {description}")
-    else:
-        raise RuntimeError(description)
-
-
 class CheckpointTrigger:
     """Represents a trigger for creating snapshots"""
 
@@ -59,10 +51,7 @@ def __init__(self, at_rules: List[CheckpointAtRule]) -> None:
         Args:
             at: list of checkpoint moments
         """
-        self._at = []
-        for at_rule in at_rules:
-            self._at.extend(at_rule.at)
-        self._at.sort()
+        self._at = sorted([a for r in at_rules for a in r.at])
 
     def next_checkpoint(self, cur_time: float) -> Optional[float]:
         if cur_time >= self._at[-1]:
@@ -214,14 +203,6 @@ def set_checkpoint_info(
         self._nextsim = None        # type: Optional[float]
         self._sim_reset = True
 
-        self._first_reuse = True
-
-        # These attributes are only used to check if implementations are
-        # following the guidelines
-        self._should_have_saved = False
-        self._should_save_final_called = False
-        self._saved_final_checkpoint = False
-
     def elapsed_walltime(self) -> float:
         """Returns elapsed wallclock_time in seconds.
         """
@@ -238,12 +219,8 @@ def should_save_snapshot(self, timestamp: float) -> bool:
         if not self._has_checkpoints:
             return False
 
-        self.__check_should_have_saved()
-
         elapsed_walltime = self.elapsed_walltime()
-        value = self.__should_save(elapsed_walltime, timestamp)
-        self._should_have_saved = value
-        return value
+        return self.__should_save(elapsed_walltime, timestamp)
 
     def should_save_final_snapshot(
             self, do_reuse: bool, f_init_max_timestamp: Optional[float]
@@ -253,8 +230,6 @@ def should_save_final_snapshot(
         if not self._has_checkpoints:
             return False
 
-        self.__check_should_have_saved()
-
         value = False
         if not do_reuse and self._checkpoint_at_end:
             value = True
@@ -269,56 +244,15 @@ def should_save_final_snapshot(
             elapsed_walltime = self.elapsed_walltime()
             value = self.__should_save(elapsed_walltime, f_init_max_timestamp)
 
-        self._should_have_saved = value
-        self._should_save_final_called = True
         return value
 
-    @property
-    def save_final_snapshot_called(self) -> bool:
-        """Check if :meth:`save_final_snapshot` was called during this
-        reuse loop.
-        """
-        return self._saved_final_checkpoint
-
-    def reuse_instance(self) -> None:
-        """Cleanup between instance reuse
-        """
-        if not self._has_checkpoints:
-            return
-        if self._first_reuse:
-            self._first_reuse = False
-        else:
-            if self._should_have_saved:
-                _checkpoint_error('"should_save_snapshot" or '
-                                  '"should_save_final_snapshot" returned'
-                                  ' positive but no snapshot was saved before'
-                                  ' exiting the reuse loop.')
-            if not (self._should_save_final_called or self._saved_final_checkpoint):
-                _checkpoint_error('You must call "should_save_final" exactly'
-                                  ' once in the reuse loop of an instance that'
-                                  ' supports checkpointing.')
-        self._should_save_final_called = False
-        self._saved_final_checkpoint = False
-
-    def update_checkpoints(self, timestamp: float, final: bool) -> None:
+    def update_checkpoints(self, timestamp: float) -> None:
         """Update last and next checkpoint times when a snapshot is made.
 
         Args:
             timestamp: timestamp as reported by the instance (or from incoming
-                F_INIT messages when final=True).
-            final: True iff this is coming from a save_final_snapshot call.
+                F_INIT messages for save_final_snapshot).
         """
-        if not self._has_checkpoints:
-            _logger.info('Saving a snapshot but no checkpoints requested by the'
-                         ' workflow. Hint: use Instance.should_save_snapshot(),'
-                         ' Instance.should_save_final_snapshot() or'
-                         ' Instance.snapshots_enabled() to test if it is useful'
-                         ' to save a snapshot.')
-            return
-        if final and self._saved_final_checkpoint:
-            raise RuntimeError(
-                    'You may only save a final snapshot once per reuse loop.')
-
         self._prevwall = self.elapsed_walltime()
         self._nextwall = self._wall.next_checkpoint(self._prevwall)
 
@@ -328,8 +262,6 @@ def update_checkpoints(self, timestamp: float, final: bool) -> None:
         # this method is also called during resume, after which we no longer
         # consider the simulation_time as reset
         self._sim_reset = False
-        self._should_have_saved = False
-        self._saved_final_checkpoint = final
 
     def get_triggers(self) -> List[str]:
         """Get trigger description(s) for the current reason for checkpointing.
@@ -338,17 +270,6 @@ def get_triggers(self) -> List[str]:
         self._last_triggers = []
         return triggers
 
-    def __check_should_have_saved(self) -> None:
-        """Check if a snapshot is saved when required."""
-        if self._should_have_saved:
-            _checkpoint_error('"should_save_snapshot" or '
-                              '"should_save_final_snapshot" returned positive'
-                              ' but no snapshot was saved before the next call'
-                              ' to a should_save_ method.'
-                              ' You must call the corresponding save_snapshot'
-                              ' or save_final_snapshot method when should_save_'
-                              ' returns True.')
-
     def __should_save(self, walltime: float, simulation_time: float) -> bool:
         """Check if a checkpoint should be taken
 
diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py
index 245775e8..b7959b86 100644
--- a/libmuscle/python/libmuscle/instance.py
+++ b/libmuscle/python/libmuscle/instance.py
@@ -9,6 +9,7 @@
 from ymmsl import (Identifier, Operator, SettingValue, Port, Reference,
                    Settings, ImplementationState)
 
+from libmuscle.api_guard import APIGuard
 from libmuscle.communicator import Communicator, Message
 from libmuscle.settings_manager import SettingsManager
 from libmuscle.logging import LogLevel
@@ -66,6 +67,9 @@ def __init__(self, ports: Optional[Dict[Operator, List[str]]] = None,
 
         self.__set_up_logging()
 
+        self._api_guard = APIGuard()
+        """Checks that the user uses the API correctly."""
+
         self._profiler = Profiler(self._instance_name(), self.__manager)
         """Profiler for this instance."""
 
@@ -143,6 +147,7 @@ def reuse_instance(self, apply_overlay: bool = True) -> bool:
                 :meth:`should_save_final_snapshot` and
                 :meth:`save_final_snapshot`, or the checkpointing tutorial.
         """
+        self._api_guard.verify_reuse_instance()
         do_reuse = self._do_reuse
         if do_reuse is None:
             # should_save_final_snapshot not called, so we need to check_reuse
@@ -158,6 +163,7 @@ def reuse_instance(self, apply_overlay: bool = True) -> bool:
             self._deregister()
             self.__manager.close()
 
+        self._api_guard.reuse_instance_done(do_reuse)
         return do_reuse
 
     def error_shutdown(self, message: str) -> None:
@@ -431,7 +437,10 @@ def resuming(self) -> bool:
             True iff the submodel must resume from a snapshot instead of the
             usual F_INIT step during this iteration of the reuse loop.
         """
-        return self._snapshot_manager.resuming()
+        self._api_guard.verify_resuming()
+        result = self._snapshot_manager.resuming()
+        self._api_guard.resuming_done(result)
+        return result
 
     def should_init(self) -> bool:
         """Check if this instance should initialize.
@@ -445,7 +454,10 @@ def should_init(self) -> bool:
         Returns:
             True if the submodel must execute the F_INIT step, False otherwise.
         """
-        return self._snapshot_manager.should_init()
+        self._api_guard.verify_should_init()
+        result = self._snapshot_manager.should_init()
+        self._api_guard.should_init_done()
+        return result
 
     def load_snapshot(self) -> Message:
         """Load a snapshot.
@@ -459,7 +471,10 @@ def load_snapshot(self) -> Message:
         Raises:
             RuntimeError: if not resuming from a snapshot.
         """
-        return self._snapshot_manager.load_snapshot()
+        self._api_guard.verify_load_snapshot()
+        result = self._snapshot_manager.load_snapshot()
+        self._api_guard.load_snapshot_done()
+        return result
 
     def should_save_snapshot(self, timestamp: float) -> bool:
         """Check if a snapshot should be saved after the S Operator of the
@@ -482,7 +497,10 @@ def should_save_snapshot(self, timestamp: float) -> bool:
             True iff a snapshot should be taken by the submodel according to the
             checkpoint rules provided in the ymmsl configuration.
         """
-        return self._snapshot_manager.should_save_snapshot(timestamp)
+        self._api_guard.verify_should_save_snapshot()
+        result = self._snapshot_manager.should_save_snapshot(timestamp)
+        self._api_guard.should_save_snapshot_done(result)
+        return result
 
     def save_snapshot(self, message: Message) -> None:
         """Save a snapshot after the S Operator of the submodel.
@@ -508,7 +526,9 @@ def save_snapshot(self, message: Message) -> None:
                 :meth:`should_save_snapshot`. The data attribute can be used to
                 store the internal state of the submodel.
         """
-        return self._snapshot_manager.save_snapshot(message)
+        self._api_guard.verify_save_snapshot()
+        self._snapshot_manager.save_snapshot(message)
+        self._api_guard.save_snapshot_done()
 
     def should_save_final_snapshot(self, *, apply_overlay: bool = True) -> bool:
         """Check if a snapshot should be saved at the end of the reuse loop.
@@ -542,13 +562,12 @@ def should_save_final_snapshot(self, *, apply_overlay: bool = True) -> bool:
             True iff a final snapshot should be taken by the submodel according
             to the checkpoint rules provided in the ymmsl configuration.
         """
-        if self._do_reuse is not None:
-            raise RuntimeError(
-                    'You may not call should_save_final_snapshot more than once'
-                    ' per reuse loop.')
+        self._api_guard.verify_should_save_final_snapshot()
         self._do_reuse = self.__check_reuse_instance(apply_overlay)
-        return self._snapshot_manager.should_save_final_snapshot(
+        result = self._snapshot_manager.should_save_final_snapshot(
                 self._do_reuse, self.__f_init_max_timestamp)
+        self._api_guard.should_save_final_snapshot_done(result)
+        return result
 
     def save_final_snapshot(self, message: Message) -> None:
         """Save a snapshot at the end of the reuse loop.
@@ -571,8 +590,10 @@ def save_final_snapshot(self, message: Message) -> None:
                 attribute can be used to store the internal state of the
                 submodel.
         """
-        return self._snapshot_manager.save_final_snapshot(
+        self._api_guard.verify_save_final_snapshot()
+        self._snapshot_manager.save_final_snapshot(
                 message, self.__f_init_max_timestamp)
+        self._api_guard.save_final_snapshot_done()
 
     @property
     def __f_init_max_timestamp(self) -> Optional[float]:
@@ -669,7 +690,7 @@ def __check_reuse_instance(self, apply_overlay: bool) -> bool:
         # TODO: _f_init_cache should be empty here, or the user didn't
         # receive something that was sent on the last go-around.
         # At least emit a warning.
-        if self.should_init() or not self._first_run:
+        if self._snapshot_manager.should_init() or not self._first_run:
             # self.should_init() might be False in first should_save_final(),
             # but self._first_run is already updated by then
             self.__pre_receive_f_init(apply_overlay)
@@ -684,8 +705,9 @@ def __check_reuse_instance(self, apply_overlay: bool) -> bool:
         no_settings_in = not self._communicator.settings_in_connected()
 
         if f_init_not_connected and no_settings_in:
-            do_reuse = self._first_run and (not self.resuming() or
-                                            not self.should_init())
+            do_reuse = self._first_run and (
+                    not self._snapshot_manager.resuming() or
+                    not self._snapshot_manager.should_init())
         else:
             for message in self._f_init_cache.values():
                 if isinstance(message.data, ClosePort):
diff --git a/libmuscle/python/libmuscle/snapshot_manager.py b/libmuscle/python/libmuscle/snapshot_manager.py
index 0bd3de83..efd869f9 100644
--- a/libmuscle/python/libmuscle/snapshot_manager.py
+++ b/libmuscle/python/libmuscle/snapshot_manager.py
@@ -52,6 +52,8 @@ def __init__(self,
         self._resume_from_snapshot = None   # type: Optional[Snapshot]
         self._next_snapshot_num = 1
 
+        self._should_save_final_called = False
+
     def get_checkpoint_info(self) -> None:
         """Request checkpoint info from the muscle manager.
         """
@@ -78,8 +80,8 @@ def _set_checkpoint_info(self,
                 # snapshot.message is None for implicit snapshots
                 self._resume_from_snapshot = snapshot
                 self._trigger_manager.update_checkpoints(
-                    snapshot.message.timestamp,
-                    snapshot.is_final_snapshot)
+                    snapshot.message.timestamp)
+                self._should_save_final_called = snapshot.is_final_snapshot
             self._communicator.restore_message_counts(
                 snapshot.port_message_counts)
             # Store a copy of the snapshot in the current run directory
@@ -103,19 +105,19 @@ def reuse_instance(self,
         # Only create implicit snapshot if not already explicitly done
         # And not in the first reuse_instance()
         if (self._stateful is not ImplementationState.STATEFUL and
-                not self._trigger_manager.save_final_snapshot_called and
+                not self._should_save_final_called and
                 not self._first_reuse):
             if self.should_save_final_snapshot(do_reuse, f_init_max_timestamp):
                 # create an empty message object to store
                 self.__save_snapshot(None, True, f_init_max_timestamp)
 
-        self._trigger_manager.reuse_instance()
-
         if self._first_reuse:
             self._first_reuse = False
         else:
             self._resume_from_snapshot = None
 
+        self._should_save_final_called = False
+
     def snapshots_enabled(self) -> bool:
         """Check if the current workflow has snapshots enabled.
         """
@@ -140,10 +142,8 @@ def should_init(self) -> bool:
     def load_snapshot(self) -> Message:
         """Get the Message to resume from.
         """
-        if self._resume_from_snapshot is None:
-            raise RuntimeError('No snapshot to load. Use "instance.resuming()"'
-                               ' to check if a snapshot is available')
-        return cast(Message, self._resume_from_snapshot.message)
+        snapshot = cast(Snapshot, self._resume_from_snapshot)
+        return cast(Message, snapshot.message)
 
     def should_save_snapshot(self, timestamp: float) -> bool:
         """See :meth:`TriggerManager.should_save_snapshot`.
@@ -155,6 +155,7 @@ def should_save_final_snapshot(
             ) -> bool:
         """See :meth:`TriggerManager.should_save_final_snapshot`.
         """
+        self._should_save_final_called = True
         return self._trigger_manager.should_save_final_snapshot(
                 do_reuse, f_init_max_timestamp)
 
@@ -211,7 +212,7 @@ def __save_snapshot(
             # For final snapshots f_init_max_snapshot is the reference time (see
             # should_save_final_snapshot).
             timestamp = f_init_max_timestamp
-        self._trigger_manager.update_checkpoints(timestamp, final)
+        self._trigger_manager.update_checkpoints(timestamp)
 
     @staticmethod
     def load_snapshot_from_file(snapshot_location: Path) -> Snapshot:
diff --git a/libmuscle/python/libmuscle/test/conftest.py b/libmuscle/python/libmuscle/test/conftest.py
index a8d0ad72..77422ee9 100644
--- a/libmuscle/python/libmuscle/test/conftest.py
+++ b/libmuscle/python/libmuscle/test/conftest.py
@@ -3,6 +3,7 @@
 
 from ymmsl import Settings
 
+from libmuscle.api_guard import APIGuard
 from libmuscle.communicator import Message
 from libmuscle.mmp_client import MMPClient
 
@@ -21,3 +22,8 @@ def message() -> Message:
 @pytest.fixture
 def message2() -> Message:
     return Message(0.0, None, {'test': 17}, Settings())
+
+
+@pytest.fixture
+def guard() -> APIGuard:
+    return APIGuard()
diff --git a/libmuscle/python/libmuscle/test/test_api_guard.py b/libmuscle/python/libmuscle/test/test_api_guard.py
new file mode 100644
index 00000000..f67bde93
--- /dev/null
+++ b/libmuscle/python/libmuscle/test/test_api_guard.py
@@ -0,0 +1,154 @@
+from typing import Callable, Set
+
+import pytest
+
+from libmuscle.api_guard import APIGuard
+
+
+def test_no_checkpointing_support(guard):
+    for _ in range(3):
+        guard.verify_reuse_instance()
+        guard.reuse_instance_done(True)
+
+    assert not guard.uses_checkpointing()
+
+    guard.verify_reuse_instance()
+    guard.reuse_instance_done(False)
+
+    assert not guard.uses_checkpointing()
+
+
+def test_final_snapshot_only(guard):
+    for i in range(4):
+        guard.verify_reuse_instance()
+        guard.reuse_instance_done(True)
+
+        guard.verify_resuming()
+        if i == 0:
+            guard.resuming_done(True)
+
+            guard.verify_load_snapshot()
+            guard.load_snapshot_done()
+        else:
+            guard.resuming_done(False)
+
+        guard.verify_should_init()
+        guard.should_init_done()
+
+        guard.verify_should_save_final_snapshot()
+        if i == 2:
+            guard.should_save_final_snapshot_done(True)
+
+            guard.verify_save_final_snapshot()
+            guard.save_final_snapshot_done()
+        else:
+            guard.should_save_final_snapshot_done(False)
+
+    guard.verify_reuse_instance()
+    guard.reuse_instance_done(False)
+
+
+def test_full_checkpointing(guard):
+    for i in range(4):
+        guard.verify_reuse_instance()
+        guard.reuse_instance_done(True)
+
+        guard.verify_resuming()
+        if i == 0:
+            guard.resuming_done(True)
+
+            guard.verify_load_snapshot()
+            guard.load_snapshot_done()
+        else:
+            guard.resuming_done(False)
+
+        guard.verify_should_init()
+        guard.should_init_done()
+
+        for j in range(3):
+            guard.verify_should_save_snapshot()
+            if j != 2:
+                guard.should_save_snapshot_done(True)
+
+                guard.verify_save_snapshot()
+                guard.save_snapshot_done()
+            else:
+                guard.should_save_snapshot_done(False)
+
+        guard.verify_should_save_final_snapshot()
+        if i == 2:
+            guard.should_save_final_snapshot_done(True)
+
+            guard.verify_save_final_snapshot()
+            guard.save_final_snapshot_done()
+        else:
+            guard.should_save_final_snapshot_done(False)
+
+    guard.verify_reuse_instance()
+    guard.reuse_instance_done(False)
+
+
+_api_guard_funs = (
+    (APIGuard.verify_reuse_instance, ()),
+    (APIGuard.reuse_instance_done, (True,)),
+    (APIGuard.verify_resuming, ()),
+    (APIGuard.resuming_done, (True,)),
+    (APIGuard.verify_load_snapshot, ()),
+    (APIGuard.load_snapshot_done, ()),
+    (APIGuard.verify_should_init, ()),
+    (APIGuard.should_init_done, ()),
+    (APIGuard.verify_should_save_snapshot, ()),
+    (APIGuard.should_save_snapshot_done, (True,)),
+    (APIGuard.verify_save_snapshot, ()),
+    (APIGuard.save_snapshot_done, ()),
+    (APIGuard.verify_should_save_final_snapshot, ()),
+    (APIGuard.should_save_final_snapshot_done, (True,)),
+    (APIGuard.verify_save_final_snapshot, ())
+)
+
+
+def run_until_before(guard: APIGuard, excluded: Callable) -> None:
+    for fun, args in _api_guard_funs:
+        if fun is excluded:
+            break
+        fun(guard, *args)
+
+
+def check_all_raise_except(guard: APIGuard, excluded: Set[Callable]) -> None:
+    for fun, args in _api_guard_funs:
+        if fun.__name__.startswith('verify_'):
+            if fun not in excluded:
+                with pytest.raises(RuntimeError):
+                    fun(guard, *args)
+            else:
+                fun(guard, *args)
+
+
+@pytest.mark.parametrize('fun', [
+        APIGuard.verify_load_snapshot,
+        APIGuard.verify_should_init, APIGuard.verify_save_snapshot,
+        APIGuard.verify_save_final_snapshot])
+def test_missing_step(guard, fun):
+    run_until_before(guard, fun)
+    check_all_raise_except(guard, {fun})
+
+
+def test_missing_resuming(guard):
+    run_until_before(guard, APIGuard.verify_resuming)
+    check_all_raise_except(guard, {
+        APIGuard.verify_resuming, APIGuard.verify_reuse_instance})
+
+
+def test_missing_should_save_final(guard):
+    run_until_before(guard, APIGuard.verify_should_save_final_snapshot)
+    check_all_raise_except(guard, {
+        APIGuard.verify_should_save_snapshot,
+        APIGuard.verify_should_save_final_snapshot})
+
+
+def test_double_should_save(guard):
+    run_until_before(guard, APIGuard.verify_should_save_snapshot)
+    guard.verify_should_save_snapshot()
+    guard.should_save_snapshot_done(True)
+    with pytest.raises(RuntimeError):
+        guard.verify_should_save_snapshot()
diff --git a/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py b/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py
index 0cbf47b2..8200854f 100644
--- a/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py
+++ b/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py
@@ -1,5 +1,4 @@
 from datetime import datetime, timedelta, timezone
-import logging
 import time
 import pytest
 from ymmsl import CheckpointRangeRule, CheckpointAtRule, Checkpoints
@@ -156,15 +155,11 @@ def test_trigger_manager():
             wallclock_time=[CheckpointAtRule([1e-12])],
             simulation_time=[CheckpointAtRule([1, 3, 5])]))
 
-    trigger_manager.reuse_instance()
-
     assert trigger_manager.should_save_snapshot(0.1)
     triggers = trigger_manager.get_triggers()
     assert len(triggers) == 1
     assert "wallclock_time" in triggers[0]
-    with pytest.raises(RuntimeError):  # did not call save in between
-        trigger_manager.should_save_snapshot(0.1)
-    trigger_manager.update_checkpoints(0.1, False)
+    trigger_manager.update_checkpoints(0.1)
 
     assert not trigger_manager.should_save_snapshot(0.99)
 
@@ -172,46 +167,13 @@ def test_trigger_manager():
     triggers = trigger_manager.get_triggers()
     assert len(triggers) == 1
     assert "simulation_time" in triggers[0]
-    trigger_manager.update_checkpoints(3.2, False)
+    trigger_manager.update_checkpoints(3.2)
 
     assert trigger_manager.should_save_final_snapshot(True, 7.0)
-    with pytest.raises(RuntimeError):  # did not call save in between
-        trigger_manager.should_save_snapshot(4.0)
-    with pytest.raises(RuntimeError):  # did not call save in between
-        trigger_manager.should_save_final_snapshot(True, 7.0)
     assert len(trigger_manager.get_triggers()) > 0
-    trigger_manager.update_checkpoints(7.0, True)
-
-    trigger_manager.reuse_instance()
+    trigger_manager.update_checkpoints(7.0)
 
     assert not trigger_manager.should_save_snapshot(7.1)
-    with pytest.raises(RuntimeError):  # no should_save_final called
-        trigger_manager.reuse_instance()
 
     assert trigger_manager.should_save_final_snapshot(False, None)
-    with pytest.raises(RuntimeError):  # not saved
-        trigger_manager.reuse_instance()
-    trigger_manager.update_checkpoints(7.1, True)
-
-    trigger_manager.reuse_instance()
-
-
-def test_trigger_manager_warnings(caplog: pytest.LogCaptureFixture,
-                                  monkeypatch: pytest.MonkeyPatch):
-    monkeypatch.setenv("MUSCLE_DISABLE_CHECKPOINT_ERRORS", "1")
-
-    reference = datetime.now(timezone.utc)
-    trigger_manager = TriggerManager()
-    trigger_manager.set_checkpoint_info(reference, Checkpoints(
-            simulation_time=[CheckpointAtRule([1, 3, 5])]))
-
-    trigger_manager.reuse_instance()
-
-    with caplog.at_level(logging.WARN):
-        n_records = len(caplog.records)
-        assert trigger_manager.should_save_snapshot(1.5)
-        assert len(caplog.records) == n_records
-
-        trigger_manager.reuse_instance()  # suppressed error
-        assert len(caplog.records) > n_records
-        assert "Suppressed checkpoint error" in caplog.records[-1].message
+    trigger_manager.update_checkpoints(7.1)
diff --git a/libmuscle/python/libmuscle/test/test_snapshot_manager.py b/libmuscle/python/libmuscle/test/test_snapshot_manager.py
index ffec4744..b4121dac 100644
--- a/libmuscle/python/libmuscle/test/test_snapshot_manager.py
+++ b/libmuscle/python/libmuscle/test/test_snapshot_manager.py
@@ -1,9 +1,7 @@
 from datetime import datetime, timezone
-import logging
 from pathlib import Path
 from unittest.mock import MagicMock
 
-import pytest
 from ymmsl import (
         Reference, Checkpoints, CheckpointRangeRule, ImplementationState)
 
@@ -12,8 +10,7 @@
 from libmuscle.snapshot_manager import SnapshotManager
 
 
-def test_no_checkpointing(caplog: pytest.LogCaptureFixture, tmp_path: Path
-                          ) -> None:
+def test_no_checkpointing(tmp_path: Path) -> None:
     manager = MagicMock()
     communicator = MagicMock()
     communicator.get_message_counts.return_value = {}
@@ -31,11 +28,6 @@ def test_no_checkpointing(caplog: pytest.LogCaptureFixture, tmp_path: Path
     assert not snapshot_manager.should_save_snapshot(5000)
     assert not snapshot_manager.should_save_final_snapshot(False, None)
 
-    with caplog.at_level(logging.INFO, 'libmuscle'):
-        snapshot_manager.save_snapshot(Message(1.0, None, None))
-        assert caplog.records[0].levelname == "INFO"
-        assert "no checkpoints" in caplog.records[0].message
-
 
 def test_save_load_snapshot(tmp_path: Path) -> None:
     manager = MagicMock()
@@ -53,8 +45,6 @@ def test_save_load_snapshot(tmp_path: Path) -> None:
 
     assert not snapshot_manager.resuming()
     snapshot_manager.reuse_instance(True, None)
-    with pytest.raises(RuntimeError):
-        snapshot_manager.load_snapshot()
 
     assert not snapshot_manager.resuming()
     assert snapshot_manager.should_save_snapshot(0.2)

From c16a6cc5b57fb5bef24cb6817a8cc7db29792fe7 Mon Sep 17 00:00:00 2001
From: Lourens Veen <l.veen@esciencecenter.nl>
Date: Wed, 4 Jan 2023 10:13:25 +0100
Subject: [PATCH 118/183] Make only Instance decide how to run the reuse loop

---
 libmuscle/python/libmuscle/instance.py        | 156 ++++++++++++------
 .../python/libmuscle/snapshot_manager.py      |  75 +++------
 .../test/test_checkpoint_triggers.py          |  10 ++
 .../libmuscle/test/test_snapshot_manager.py   |  59 +++----
 4 files changed, 163 insertions(+), 137 deletions(-)

diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py
index b7959b86..247af77d 100644
--- a/libmuscle/python/libmuscle/instance.py
+++ b/libmuscle/python/libmuscle/instance.py
@@ -84,16 +84,27 @@ def __init__(self, ports: Optional[Dict[Operator, List[str]]] = None,
         """Settings for this instance."""
 
         self._snapshot_manager = SnapshotManager(
-                self._instance_name(), self.__manager, self._communicator,
-                self._stateful)
+                self._instance_name(), self.__manager, self._communicator)
         """Keeps track of checkpointing and snapshots"""
 
-        self._first_run = True
-        """Keeps track of whether this is the first reuse run."""
+        self._first_run = None          # type: Optional[bool]
+        """Whether this is the first iteration of the reuse loop"""
+
         self._do_reuse = None           # type: Optional[bool]
-        """Caching variable for result from :meth:`__check_reuse_instance`"""
+        """Whether to enter this iteration of the reuse loop
+
+        This is None during the reuse loop, and set between
+        should_save_final_snapshot and reuse_instance.
+        """
+
+        self._do_resume = False
+        """Whether to resume on this iteration of the reuse loop"""
+
+        self._do_init = False
+        """Whether to do f_init on this iteration of the reuse loop"""
 
         self._f_init_cache = dict()     # type: _FInitCacheType
+        """Stores pre-received messages for f_init ports"""
 
         self._register()
         self._connect()
@@ -148,14 +159,27 @@ def reuse_instance(self, apply_overlay: bool = True) -> bool:
                 :meth:`save_final_snapshot`, or the checkpointing tutorial.
         """
         self._api_guard.verify_reuse_instance()
-        do_reuse = self._do_reuse
-        if do_reuse is None:
-            # should_save_final_snapshot not called, so we need to check_reuse
-            do_reuse = self.__check_reuse_instance(apply_overlay)
-        self._do_reuse = None
 
-        self._snapshot_manager.reuse_instance(
-                do_reuse, self.__f_init_max_timestamp)
+        if self._do_reuse is not None:
+            # thank you, should_save_final_snapshot, for running this already
+            do_reuse = self._do_reuse
+            self._do_reuse = None
+        else:
+            do_reuse = self._decide_reuse_instance(apply_overlay)
+
+        # now _first_run, _do_resume and _do_init are also set correctly
+
+        do_implicit_checkpoint = (
+                not self._first_run and
+                not self._api_guard.uses_checkpointing() and
+                self._stateful is not ImplementationState.STATEFUL)
+
+        if do_implicit_checkpoint:
+            if self._snapshot_manager.should_save_final_snapshot(
+                    do_reuse, self.__f_init_max_timestamp):
+                # store a None instead of a Message
+                self._snapshot_manager.save_implicit_snapshot(
+                        self.__f_init_max_timestamp)
 
         if not do_reuse:
             self.__close_ports()
@@ -438,9 +462,8 @@ def resuming(self) -> bool:
             usual F_INIT step during this iteration of the reuse loop.
         """
         self._api_guard.verify_resuming()
-        result = self._snapshot_manager.resuming()
-        self._api_guard.resuming_done(result)
-        return result
+        self._api_guard.resuming_done(self._do_resume)
+        return self._do_resume
 
     def should_init(self) -> bool:
         """Check if this instance should initialize.
@@ -455,9 +478,8 @@ def should_init(self) -> bool:
             True if the submodel must execute the F_INIT step, False otherwise.
         """
         self._api_guard.verify_should_init()
-        result = self._snapshot_manager.should_init()
         self._api_guard.should_init_done()
-        return result
+        return self._do_init
 
     def load_snapshot(self) -> Message:
         """Load a snapshot.
@@ -563,7 +585,7 @@ def should_save_final_snapshot(self, *, apply_overlay: bool = True) -> bool:
             to the checkpoint rules provided in the ymmsl configuration.
         """
         self._api_guard.verify_should_save_final_snapshot()
-        self._do_reuse = self.__check_reuse_instance(apply_overlay)
+        self._do_reuse = self._decide_reuse_instance(apply_overlay)
         result = self._snapshot_manager.should_save_final_snapshot(
                 self._do_reuse, self.__f_init_max_timestamp)
         self._api_guard.should_save_final_snapshot_done(result)
@@ -678,43 +700,51 @@ def __set_up_logging(self) -> None:
                                                      self.__manager)
             logging.getLogger().addHandler(self._mmp_handler)
 
-    def __check_reuse_instance(self, apply_overlay: bool) -> bool:
-        """Pre-receive F_INIT messages and detect if this instance is reused.
+    def _decide_reuse_instance(self, apply_overlay: bool) -> bool:
+        """Decide whether and how to reuse the instance.
 
-        This is called during :meth:`should_save_final_snapshot` to detect if a
-        snapshot must be taken. If an instance doesn't implement checkpointing,
-        :meth:`reuse_instance` will call it instead.
+        This sets self._first_run, self._do_resume and self._do_init, and
+        returns whether to reuse one more time. This is the real top of
+        the reuse loop, and it gets called by reuse_instance and
+        should_save_final_snapshot.
         """
-        do_reuse = self.__receive_settings()
-
-        # TODO: _f_init_cache should be empty here, or the user didn't
-        # receive something that was sent on the last go-around.
-        # At least emit a warning.
-        if self._snapshot_manager.should_init() or not self._first_run:
-            # self.should_init() might be False in first should_save_final(),
-            # but self._first_run is already updated by then
-            self.__pre_receive_f_init(apply_overlay)
-
-        self._set_local_log_level()
-        self._set_remote_log_level()
+        if self._first_run is None:
+            self._first_run = True
+        elif self._first_run:
+            self._first_run = False
+
+        # resume from intermediate
+        if self._first_run and self._snapshot_manager.resuming_from_intermediate():
+            self._do_resume = True
+            self._do_init = False
+            return True
+
+        f_init_connected = self._have_f_init_connections()
+
+        # resume from final
+        if self._first_run and self._snapshot_manager.resuming_from_final():
+            if f_init_connected:
+                got_f_init_messages = self._pre_receive(apply_overlay)
+                self._do_resume = True
+                self._do_init = True
+                return got_f_init_messages
+            else:
+                self._do_resume = False     # unused
+                self._do_init = False       # unused
+                return False
 
-        ports = self._communicator.list_ports()
-        f_init_not_connected = all(
-                [not self.is_connected(port)
-                 for port in ports.get(Operator.F_INIT, [])])
-        no_settings_in = not self._communicator.settings_in_connected()
+        # fresh start or resuming from implicit snapshot
+        self._do_resume = False
 
-        if f_init_not_connected and no_settings_in:
-            do_reuse = self._first_run and (
-                    not self._snapshot_manager.resuming() or
-                    not self._snapshot_manager.should_init())
-        else:
-            for message in self._f_init_cache.values():
-                if isinstance(message.data, ClosePort):
-                    do_reuse = False
-        self._first_run = False
+        # simple straight single run without resuming
+        if not f_init_connected:
+            self._do_init = self._first_run
+            return self._first_run
 
-        return do_reuse
+        # not resuming and f_init connected, run while we get messages
+        got_f_init_messages = self._pre_receive(apply_overlay)
+        self._do_init = got_f_init_messages
+        return got_f_init_messages
 
     def __receive_message(
             self, port_name: str, slot: Optional[int],
@@ -844,6 +874,32 @@ def __check_port(self, port_name: str) -> None:
             self.__shutdown(err_msg)
             raise RuntimeError(err_msg)
 
+    def _have_f_init_connections(self) -> bool:
+        """Checks whether we have connected F_INIT ports.
+
+        This includes muscle_settings_in, and any user-defined ports.
+        """
+        ports = self._communicator.list_ports()
+        f_init_connected = any(
+                [self.is_connected(port)
+                 for port in ports.get(Operator.F_INIT, [])])
+        return f_init_connected or self._communicator.settings_in_connected()
+
+    def _pre_receive(self, apply_overlay: bool) -> bool:
+        """Pre-receives on all ports.
+
+        This includes muscle_settings_in and all user-defined ports.
+
+        Returns:
+            True iff no ClosePort messages were received.
+        """
+        all_ports_open = self.__receive_settings()
+        self.__pre_receive_f_init(apply_overlay)
+        for message in self._f_init_cache.values():
+            if isinstance(message.data, ClosePort):
+                all_ports_open = False
+        return all_ports_open
+
     def __receive_settings(self) -> bool:
         """Receives settings on muscle_settings_in.
 
diff --git a/libmuscle/python/libmuscle/snapshot_manager.py b/libmuscle/python/libmuscle/snapshot_manager.py
index efd869f9..c42a24c7 100644
--- a/libmuscle/python/libmuscle/snapshot_manager.py
+++ b/libmuscle/python/libmuscle/snapshot_manager.py
@@ -3,7 +3,7 @@
 from pathlib import Path
 from typing import cast, Optional
 
-from ymmsl import Checkpoints, Reference, Operator, ImplementationState
+from ymmsl import Checkpoints, Reference, Operator
 
 from libmuscle.checkpoint_triggers import TriggerManager
 from libmuscle.communicator import Communicator, Message
@@ -30,8 +30,7 @@ class SnapshotManager:
     def __init__(self,
                  instance_id: Reference,
                  manager: MMPClient,
-                 communicator: Communicator,
-                 stateful: ImplementationState) -> None:
+                 communicator: Communicator) -> None:
         """Create a new snapshot manager
 
         Args:
@@ -45,15 +44,11 @@ def __init__(self,
         self._safe_id = str(instance_id).replace("[", "-").replace("]", "")
         self._communicator = communicator
         self._manager = manager
-        self._stateful = stateful
 
-        self._first_reuse = True
         self._trigger_manager = TriggerManager()
         self._resume_from_snapshot = None   # type: Optional[Snapshot]
         self._next_snapshot_num = 1
 
-        self._should_save_final_called = False
-
     def get_checkpoint_info(self) -> None:
         """Request checkpoint info from the muscle manager.
         """
@@ -81,7 +76,6 @@ def _set_checkpoint_info(self,
                 self._resume_from_snapshot = snapshot
                 self._trigger_manager.update_checkpoints(
                     snapshot.message.timestamp)
-                self._should_save_final_called = snapshot.is_final_snapshot
             self._communicator.restore_message_counts(
                 snapshot.port_message_counts)
             # Store a copy of the snapshot in the current run directory
@@ -89,54 +83,27 @@ def _set_checkpoint_info(self,
             metadata = SnapshotMetadata.from_snapshot(snapshot, str(path))
             self._manager.submit_snapshot_metadata(self._instance_id, metadata)
 
-    def reuse_instance(self,
-                       do_reuse: bool, f_init_max_timestamp: Optional[float]
-                       ) -> None:
-        """Callback on Instance.reuse_instance
-
-        Args:
-            snapshot_directory: Path to store this instance's snapshots in.
-            do_reuse: Used for implicit snapshots of stateless instances. See
-                :meth:`should_save_final_snapshot`.
-            f_init_max_timestamp: Used for implicit snapshots of stateless
-                instances. See :meth:`should_save_final_snapshot`.
-        """
-        # Implicit snapshots for stateless / weakly stateful instances
-        # Only create implicit snapshot if not already explicitly done
-        # And not in the first reuse_instance()
-        if (self._stateful is not ImplementationState.STATEFUL and
-                not self._should_save_final_called and
-                not self._first_reuse):
-            if self.should_save_final_snapshot(do_reuse, f_init_max_timestamp):
-                # create an empty message object to store
-                self.__save_snapshot(None, True, f_init_max_timestamp)
-
-        if self._first_reuse:
-            self._first_reuse = False
-        else:
-            self._resume_from_snapshot = None
-
-        self._should_save_final_called = False
-
     def snapshots_enabled(self) -> bool:
         """Check if the current workflow has snapshots enabled.
         """
         return self._trigger_manager.snapshots_enabled()
 
-    def resuming(self) -> bool:
-        """Check if we are resuming during this reuse iteration.
+    def resuming_from_intermediate(self) -> bool:
+        """Check whether we have an intermediate snapshot.
+        Doesn't say whether we should resume now, just that we were
+        given an intermediate snapshot to resume from by the manager.
         """
-        return self._resume_from_snapshot is not None
-
-    def should_init(self) -> bool:
-        """Check if F_INIT should be run in this reuse loop.
-
-        Returns:
-            True: when not resuming this reuse loop, or when resuming from a
-                final snapshot.
-            False: otherwise
-        """
-        return (self._resume_from_snapshot is None or
+        return (
+                self._resume_from_snapshot is not None and
+                not self._resume_from_snapshot.is_final_snapshot)
+
+    def resuming_from_final(self) -> bool:
+        """Check whether we have a final snapshot.
+        Doesn't say whether we should resume now, just that we were
+        given an intermediate snapshot to resume from by the manager.
+         """
+        return (
+                self._resume_from_snapshot is not None and
                 self._resume_from_snapshot.is_final_snapshot)
 
     def load_snapshot(self) -> Message:
@@ -155,7 +122,6 @@ def should_save_final_snapshot(
             ) -> bool:
         """See :meth:`TriggerManager.should_save_final_snapshot`.
         """
-        self._should_save_final_called = True
         return self._trigger_manager.should_save_final_snapshot(
                 do_reuse, f_init_max_timestamp)
 
@@ -174,6 +140,12 @@ def save_final_snapshot(
             raise ValueError(_NO_MESSAGE_PROVIDED.format('save_final_snapshot'))
         self.__save_snapshot(msg, True, f_init_max_timestamp)
 
+    def save_implicit_snapshot(
+            self, f_init_max_timestamp: Optional[float]) -> None:
+        """Save final snapshot without a message.
+        """
+        self.__save_snapshot(None, True, f_init_max_timestamp)
+
     def __save_snapshot(
             self, msg: Optional[Message], final: bool,
             f_init_max_timestamp: Optional[float] = None
@@ -183,6 +155,7 @@ def __save_snapshot(
         Args:
             msg: Message object representing the snapshot.
             final: True iff called from save_final_snapshot.
+            f_init_max_timestamp: Timestamp for final snapshots.
         """
         triggers = self._trigger_manager.get_triggers()
         wallclock_time = self._trigger_manager.elapsed_walltime()
diff --git a/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py b/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py
index 8200854f..388e6eca 100644
--- a/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py
+++ b/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py
@@ -177,3 +177,13 @@ def test_trigger_manager():
 
     assert trigger_manager.should_save_final_snapshot(False, None)
     trigger_manager.update_checkpoints(7.1)
+
+
+def test_no_checkpointing() -> None:
+    trigger_manager = TriggerManager()
+    trigger_manager.set_checkpoint_info(
+            datetime.now(timezone.utc), Checkpoints())
+
+    assert not trigger_manager.should_save_snapshot(1)
+    assert not trigger_manager.should_save_snapshot(5000)
+    assert not trigger_manager.should_save_final_snapshot(False, None)
diff --git a/libmuscle/python/libmuscle/test/test_snapshot_manager.py b/libmuscle/python/libmuscle/test/test_snapshot_manager.py
index b4121dac..fa976e14 100644
--- a/libmuscle/python/libmuscle/test/test_snapshot_manager.py
+++ b/libmuscle/python/libmuscle/test/test_snapshot_manager.py
@@ -2,8 +2,7 @@
 from pathlib import Path
 from unittest.mock import MagicMock
 
-from ymmsl import (
-        Reference, Checkpoints, CheckpointRangeRule, ImplementationState)
+from ymmsl import Reference, Checkpoints, CheckpointRangeRule
 
 from libmuscle.communicator import Message
 from libmuscle.snapshot import SnapshotMetadata
@@ -14,19 +13,13 @@ def test_no_checkpointing(tmp_path: Path) -> None:
     manager = MagicMock()
     communicator = MagicMock()
     communicator.get_message_counts.return_value = {}
-    snapshot_manager = SnapshotManager(
-            Reference('test'), manager, communicator,
-            ImplementationState.STATEFUL)
+    snapshot_manager = SnapshotManager(Reference('test'), manager, communicator)
 
     snapshot_manager._set_checkpoint_info(
             datetime.now(timezone.utc), Checkpoints(), None, tmp_path)
 
-    assert not snapshot_manager.resuming()
-    snapshot_manager.reuse_instance(True, None)
-    assert not snapshot_manager.resuming()
-    assert not snapshot_manager.should_save_snapshot(1)
-    assert not snapshot_manager.should_save_snapshot(5000)
-    assert not snapshot_manager.should_save_final_snapshot(False, None)
+    assert not snapshot_manager.resuming_from_intermediate()
+    assert not snapshot_manager.resuming_from_final()
 
 
 def test_save_load_snapshot(tmp_path: Path) -> None:
@@ -36,18 +29,15 @@ def test_save_load_snapshot(tmp_path: Path) -> None:
     communicator.get_message_counts.return_value = port_message_counts
 
     instance_id = Reference('test[1]')
-    snapshot_manager = SnapshotManager(
-            instance_id, manager, communicator, ImplementationState.STATEFUL)
+    snapshot_manager = SnapshotManager(instance_id, manager, communicator)
 
     checkpoints = Checkpoints(simulation_time=[CheckpointRangeRule(every=1)])
     snapshot_manager._set_checkpoint_info(
             datetime.now(timezone.utc), checkpoints, None, tmp_path)
 
-    assert not snapshot_manager.resuming()
-    snapshot_manager.reuse_instance(True, None)
-
-    assert not snapshot_manager.resuming()
     assert snapshot_manager.should_save_snapshot(0.2)
+    assert not snapshot_manager.resuming_from_intermediate()
+    assert not snapshot_manager.resuming_from_final()
     snapshot_manager.save_snapshot(Message(0.2, None, 'test data'))
 
     communicator.get_message_counts.assert_called_with()
@@ -65,16 +55,14 @@ def test_save_load_snapshot(tmp_path: Path) -> None:
     assert snapshot_path.parent == tmp_path
     assert snapshot_path.name == 'test-1_1.pack'
 
-    snapshot_manager2 = SnapshotManager(
-            instance_id, manager, communicator, ImplementationState.STATEFUL)
+    snapshot_manager2 = SnapshotManager(instance_id, manager, communicator)
 
     snapshot_manager2._set_checkpoint_info(
             datetime.now(timezone.utc), checkpoints, snapshot_path, tmp_path)
     communicator.restore_message_counts.assert_called_with(port_message_counts)
 
-    assert snapshot_manager2.resuming()
-    snapshot_manager2.reuse_instance(True, None)
-    assert snapshot_manager2.resuming()
+    assert snapshot_manager2.resuming_from_intermediate()
+    assert not snapshot_manager2.resuming_from_final()
     msg = snapshot_manager2.load_snapshot()
     assert msg.timestamp == 0.2
     assert msg.next_timestamp is None
@@ -98,9 +86,11 @@ def test_save_load_snapshot(tmp_path: Path) -> None:
     assert snapshot_path.parent == tmp_path
     assert snapshot_path.name == 'test-1_3.pack'
 
-    assert snapshot_manager2.resuming()
-    snapshot_manager2.reuse_instance(True, None)
-    assert not snapshot_manager2.resuming()
+    assert snapshot_manager2.resuming_from_intermediate()
+    assert not snapshot_manager2.resuming_from_final()
+    snapshot_manager2.load_snapshot()
+    assert snapshot_manager2.resuming_from_intermediate()
+    assert not snapshot_manager2.resuming_from_final()
 
 
 def test_save_load_implicit_snapshot(tmp_path: Path) -> None:
@@ -110,16 +100,15 @@ def test_save_load_implicit_snapshot(tmp_path: Path) -> None:
     communicator.get_message_counts.return_value = port_message_counts
 
     instance_id = Reference('test[1]')
-    snapshot_manager = SnapshotManager(
-            instance_id, manager, communicator, ImplementationState.STATELESS)
+    snapshot_manager = SnapshotManager(instance_id, manager, communicator)
 
     checkpoints = Checkpoints(simulation_time=[CheckpointRangeRule(every=1)])
     snapshot_manager._set_checkpoint_info(
             datetime.now(timezone.utc), checkpoints, None, tmp_path)
 
-    assert not snapshot_manager.resuming()
-    snapshot_manager.reuse_instance(True, None)
-    snapshot_manager.reuse_instance(True, 1.5)
+    assert not snapshot_manager.resuming_from_intermediate()
+    assert not snapshot_manager.resuming_from_final()
+    snapshot_manager.save_implicit_snapshot(1.5)
     manager.submit_snapshot_metadata.assert_called_once()
     instance, metadata = manager.submit_snapshot_metadata.call_args[0]
     assert instance == instance_id
@@ -127,8 +116,7 @@ def test_save_load_implicit_snapshot(tmp_path: Path) -> None:
     snapshot_path = Path(metadata.snapshot_filename)
     manager.submit_snapshot_metadata.reset_mock()
 
-    snapshot_manager2 = SnapshotManager(
-            instance_id, manager, communicator, ImplementationState.STATELESS)
+    snapshot_manager2 = SnapshotManager(instance_id, manager, communicator)
 
     snapshot_manager2._set_checkpoint_info(
             datetime.now(timezone.utc), checkpoints, snapshot_path, tmp_path)
@@ -136,8 +124,7 @@ def test_save_load_implicit_snapshot(tmp_path: Path) -> None:
     manager.submit_snapshot_metadata.assert_called_once()
     manager.submit_snapshot_metadata.reset_mock()
 
-    assert not snapshot_manager2.resuming()
-    snapshot_manager2.reuse_instance(True, 1.5)
-    assert not snapshot_manager2.resuming()
-    snapshot_manager2.reuse_instance(True, 2.5)
+    assert not snapshot_manager2.resuming_from_intermediate()
+    assert not snapshot_manager2.resuming_from_final()
+    snapshot_manager2.save_implicit_snapshot(2.5)
     manager.submit_snapshot_metadata.assert_called_once()

From 347eb284a4233ec8d85c3ee9ff4d9fb08b5b3e00 Mon Sep 17 00:00:00 2001
From: Lourens Veen <l.veen@esciencecenter.nl>
Date: Wed, 4 Jan 2023 18:43:25 +0100
Subject: [PATCH 119/183] Factor TriggerManager out of SnapshotManager

---
 libmuscle/python/libmuscle/instance.py        |  61 +++++++---
 .../python/libmuscle/snapshot_manager.py      | 108 ++++++------------
 .../libmuscle/test/test_snapshot_manager.py   |  47 +++-----
 3 files changed, 104 insertions(+), 112 deletions(-)

diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py
index 247af77d..a94ccb76 100644
--- a/libmuscle/python/libmuscle/instance.py
+++ b/libmuscle/python/libmuscle/instance.py
@@ -10,6 +10,7 @@
                    Settings, ImplementationState)
 
 from libmuscle.api_guard import APIGuard
+from libmuscle.checkpoint_triggers import TriggerManager
 from libmuscle.communicator import Communicator, Message
 from libmuscle.settings_manager import SettingsManager
 from libmuscle.logging import LogLevel
@@ -85,7 +86,10 @@ def __init__(self, ports: Optional[Dict[Operator, List[str]]] = None,
 
         self._snapshot_manager = SnapshotManager(
                 self._instance_name(), self.__manager, self._communicator)
-        """Keeps track of checkpointing and snapshots"""
+        """Resumes, loads and saves snapshots."""
+
+        self._trigger_manager = TriggerManager()
+        """Keeps track of checkpoints and triggers snapshots."""
 
         self._first_run = None          # type: Optional[bool]
         """Whether this is the first iteration of the reuse loop"""
@@ -108,9 +112,22 @@ def __init__(self, ports: Optional[Dict[Operator, List[str]]] = None,
 
         self._register()
         self._connect()
-        # Note: SnapshotManager.get_checkpoint_info needs to have the ports
-        # initialized so it comes after self._connect()
-        self._snapshot_manager.get_checkpoint_info()
+
+        # Note: get_checkpoint_info needs to have the ports initialized
+        # so it comes after self._connect()
+        checkpoint_info = self.__manager.get_checkpoint_info(
+                self._instance_name())
+
+        utc_reference, checkpoints = checkpoint_info[0:2]
+        self._trigger_manager.set_checkpoint_info(utc_reference, checkpoints)
+
+        resume_snapshot, snapshot_dir = checkpoint_info[2:4]
+        saved_at = self._snapshot_manager.prepare_resume(
+                resume_snapshot, snapshot_dir)
+
+        if saved_at is not None:
+            self._trigger_manager.update_checkpoints(saved_at)
+
         self._set_local_log_level()
         self._set_remote_log_level()
 
@@ -175,11 +192,10 @@ def reuse_instance(self, apply_overlay: bool = True) -> bool:
                 self._stateful is not ImplementationState.STATEFUL)
 
         if do_implicit_checkpoint:
-            if self._snapshot_manager.should_save_final_snapshot(
+            if self._trigger_manager.should_save_final_snapshot(
                     do_reuse, self.__f_init_max_timestamp):
                 # store a None instead of a Message
-                self._snapshot_manager.save_implicit_snapshot(
-                        self.__f_init_max_timestamp)
+                self._save_snapshot(None, True, self.__f_init_max_timestamp)
 
         if not do_reuse:
             self.__close_ports()
@@ -443,7 +459,7 @@ def snapshots_enabled(self) -> bool:
         Returns:
             True iff checkpoint rules are defined in the workflow yMMSL.
         """
-        return self._snapshot_manager.snapshots_enabled()
+        return self._trigger_manager.snapshots_enabled()
 
     def resuming(self) -> bool:
         """Check if this instance is resuming from a snapshot.
@@ -520,7 +536,7 @@ def should_save_snapshot(self, timestamp: float) -> bool:
             checkpoint rules provided in the ymmsl configuration.
         """
         self._api_guard.verify_should_save_snapshot()
-        result = self._snapshot_manager.should_save_snapshot(timestamp)
+        result = self._trigger_manager.should_save_snapshot(timestamp)
         self._api_guard.should_save_snapshot_done(result)
         return result
 
@@ -549,7 +565,7 @@ def save_snapshot(self, message: Message) -> None:
                 store the internal state of the submodel.
         """
         self._api_guard.verify_save_snapshot()
-        self._snapshot_manager.save_snapshot(message)
+        self._save_snapshot(message, False)
         self._api_guard.save_snapshot_done()
 
     def should_save_final_snapshot(self, *, apply_overlay: bool = True) -> bool:
@@ -585,9 +601,11 @@ def should_save_final_snapshot(self, *, apply_overlay: bool = True) -> bool:
             to the checkpoint rules provided in the ymmsl configuration.
         """
         self._api_guard.verify_should_save_final_snapshot()
+
         self._do_reuse = self._decide_reuse_instance(apply_overlay)
-        result = self._snapshot_manager.should_save_final_snapshot(
+        result = self._trigger_manager.should_save_final_snapshot(
                 self._do_reuse, self.__f_init_max_timestamp)
+
         self._api_guard.should_save_final_snapshot_done(result)
         return result
 
@@ -613,8 +631,7 @@ def save_final_snapshot(self, message: Message) -> None:
                 submodel.
         """
         self._api_guard.verify_save_final_snapshot()
-        self._snapshot_manager.save_final_snapshot(
-                message, self.__f_init_max_timestamp)
+        self._save_snapshot(message, True, self.__f_init_max_timestamp)
         self._api_guard.save_final_snapshot_done()
 
     @property
@@ -746,6 +763,24 @@ def _decide_reuse_instance(self, apply_overlay: bool) -> bool:
         self._do_init = got_f_init_messages
         return got_f_init_messages
 
+    def _save_snapshot(
+            self, message: Optional[Message], final: bool,
+            f_init_max_timestamp: Optional[float] = None) -> None:
+        """Save a snapshot to disk and notify manager.
+
+        Args:
+            message: The data to save
+            final: Whether this is a final snapshot or an intermediate
+                one
+            f_init_max_timestamp: Timestamp for final snapshots
+        """
+        triggers = self._trigger_manager.get_triggers()
+        walltime = self._trigger_manager.elapsed_walltime()
+        timestamp = self._snapshot_manager.save_snapshot(
+                message, final, triggers, walltime,
+                f_init_max_timestamp)
+        self._trigger_manager.update_checkpoints(timestamp)
+
     def __receive_message(
             self, port_name: str, slot: Optional[int],
             default: Optional[Message], with_settings: bool
diff --git a/libmuscle/python/libmuscle/snapshot_manager.py b/libmuscle/python/libmuscle/snapshot_manager.py
index c42a24c7..f756d05d 100644
--- a/libmuscle/python/libmuscle/snapshot_manager.py
+++ b/libmuscle/python/libmuscle/snapshot_manager.py
@@ -1,11 +1,9 @@
 import logging
-from datetime import datetime
 from pathlib import Path
-from typing import cast, Optional
+from typing import cast, List, Optional
 
-from ymmsl import Checkpoints, Reference, Operator
+from ymmsl import Reference, Operator
 
-from libmuscle.checkpoint_triggers import TriggerManager
 from libmuscle.communicator import Communicator, Message
 from libmuscle.mmp_client import MMPClient
 from libmuscle.snapshot import MsgPackSnapshot, Snapshot, SnapshotMetadata
@@ -45,37 +43,37 @@ def __init__(self,
         self._communicator = communicator
         self._manager = manager
 
-        self._trigger_manager = TriggerManager()
         self._resume_from_snapshot = None   # type: Optional[Snapshot]
         self._next_snapshot_num = 1
 
-    def get_checkpoint_info(self) -> None:
-        """Request checkpoint info from the muscle manager.
-        """
-        checkpoint_info = self._manager.get_checkpoint_info(self._instance_id)
-        self._set_checkpoint_info(*checkpoint_info)
-
-    def _set_checkpoint_info(self,
-                             utc_reference: datetime,
-                             checkpoints: Checkpoints,
-                             resume: Optional[Path],
-                             snapshot_directory: Optional[Path]) -> None:
+    def prepare_resume(
+            self, resume_snapshot: Optional[Path],
+            snapshot_directory: Optional[Path]) -> Optional[float]:
         """Apply checkpoint info received from the manager.
 
+        If there is a snapshot to resume from, this loads it and does
+        any resume work that libmuscle should do, including restoring
+        message counts and storing the resumed-from snapshot again as
+        our first snapshot.
+
         Args:
-            utc_reference: datetime (in UTC) indicating wallclock_time=0
-            checkpoints: requested workflow checkpoints
-            resume: previous snapshot to resume from (or None if not resuming)
+            resume_snapshot: Snapshot to resume from (or None if not
+                resuming)
+            snapshot_directory: directory to save snapshots in
+
+        Returns:
+            Time at which the initial snapshot was saved, if resuming.
         """
-        self._trigger_manager.set_checkpoint_info(utc_reference, checkpoints)
+        result = None       # type: Optional[float]
         self._snapshot_directory = snapshot_directory or Path.cwd()
-        if resume is not None:
-            snapshot = self.load_snapshot_from_file(resume)
+        if resume_snapshot is not None:
+            snapshot = self.load_snapshot_from_file(resume_snapshot)
+
             if snapshot.message is not None:
                 # snapshot.message is None for implicit snapshots
                 self._resume_from_snapshot = snapshot
-                self._trigger_manager.update_checkpoints(
-                    snapshot.message.timestamp)
+                result = snapshot.message.timestamp
+
             self._communicator.restore_message_counts(
                 snapshot.port_message_counts)
             # Store a copy of the snapshot in the current run directory
@@ -83,13 +81,11 @@ def _set_checkpoint_info(self,
             metadata = SnapshotMetadata.from_snapshot(snapshot, str(path))
             self._manager.submit_snapshot_metadata(self._instance_id, metadata)
 
-    def snapshots_enabled(self) -> bool:
-        """Check if the current workflow has snapshots enabled.
-        """
-        return self._trigger_manager.snapshots_enabled()
+        return result
 
     def resuming_from_intermediate(self) -> bool:
         """Check whether we have an intermediate snapshot.
+
         Doesn't say whether we should resume now, just that we were
         given an intermediate snapshot to resume from by the manager.
         """
@@ -99,9 +95,10 @@ def resuming_from_intermediate(self) -> bool:
 
     def resuming_from_final(self) -> bool:
         """Check whether we have a final snapshot.
+
         Doesn't say whether we should resume now, just that we were
         given an intermediate snapshot to resume from by the manager.
-         """
+        """
         return (
                 self._resume_from_snapshot is not None and
                 self._resume_from_snapshot.is_final_snapshot)
@@ -112,54 +109,23 @@ def load_snapshot(self) -> Message:
         snapshot = cast(Snapshot, self._resume_from_snapshot)
         return cast(Message, snapshot.message)
 
-    def should_save_snapshot(self, timestamp: float) -> bool:
-        """See :meth:`TriggerManager.should_save_snapshot`.
-        """
-        return self._trigger_manager.should_save_snapshot(timestamp)
-
-    def should_save_final_snapshot(
-            self, do_reuse: bool, f_init_max_timestamp: Optional[float]
-            ) -> bool:
-        """See :meth:`TriggerManager.should_save_final_snapshot`.
-        """
-        return self._trigger_manager.should_save_final_snapshot(
-                do_reuse, f_init_max_timestamp)
-
-    def save_snapshot(self, msg: Message) -> None:
-        """Save snapshot contained in the message object.
-        """
-        if not isinstance(msg, Message):
-            raise ValueError(_NO_MESSAGE_PROVIDED.format('save_snapshot'))
-        self.__save_snapshot(msg, False)
-
-    def save_final_snapshot(
-            self, msg: Message, f_init_max_timestamp: Optional[float]) -> None:
-        """Save final snapshot contained in the message object.
-        """
-        if not isinstance(msg, Message):
-            raise ValueError(_NO_MESSAGE_PROVIDED.format('save_final_snapshot'))
-        self.__save_snapshot(msg, True, f_init_max_timestamp)
-
-    def save_implicit_snapshot(
-            self, f_init_max_timestamp: Optional[float]) -> None:
-        """Save final snapshot without a message.
-        """
-        self.__save_snapshot(None, True, f_init_max_timestamp)
-
-    def __save_snapshot(
+    def save_snapshot(
             self, msg: Optional[Message], final: bool,
-            f_init_max_timestamp: Optional[float] = None
-            ) -> None:
-        """Actual implementation used by save_(final_)snapshot.
+            triggers: List[str], wallclock_time: float,
+            f_init_max_timestamp: Optional[float] = None,
+            ) -> float:
+        """Save a (final) snapshot.
 
         Args:
             msg: Message object representing the snapshot.
             final: True iff called from save_final_snapshot.
+            triggers: Description of checkpoints that triggered this.
+            wallclock_time: Wallclock time when saving.
             f_init_max_timestamp: Timestamp for final snapshots.
-        """
-        triggers = self._trigger_manager.get_triggers()
-        wallclock_time = self._trigger_manager.elapsed_walltime()
 
+        Returns:
+            Simulation time at which the snapshot was made
+        """
         port_message_counts = self._communicator.get_message_counts()
         if final:
             # Decrease F_INIT port counts by one: F_INIT messages are already
@@ -185,7 +151,7 @@ def __save_snapshot(
             # For final snapshots f_init_max_snapshot is the reference time (see
             # should_save_final_snapshot).
             timestamp = f_init_max_timestamp
-        self._trigger_manager.update_checkpoints(timestamp)
+        return timestamp
 
     @staticmethod
     def load_snapshot_from_file(snapshot_location: Path) -> Snapshot:
diff --git a/libmuscle/python/libmuscle/test/test_snapshot_manager.py b/libmuscle/python/libmuscle/test/test_snapshot_manager.py
index fa976e14..31423bb0 100644
--- a/libmuscle/python/libmuscle/test/test_snapshot_manager.py
+++ b/libmuscle/python/libmuscle/test/test_snapshot_manager.py
@@ -1,8 +1,7 @@
-from datetime import datetime, timezone
 from pathlib import Path
 from unittest.mock import MagicMock
 
-from ymmsl import Reference, Checkpoints, CheckpointRangeRule
+from ymmsl import Reference
 
 from libmuscle.communicator import Message
 from libmuscle.snapshot import SnapshotMetadata
@@ -15,9 +14,7 @@ def test_no_checkpointing(tmp_path: Path) -> None:
     communicator.get_message_counts.return_value = {}
     snapshot_manager = SnapshotManager(Reference('test'), manager, communicator)
 
-    snapshot_manager._set_checkpoint_info(
-            datetime.now(timezone.utc), Checkpoints(), None, tmp_path)
-
+    snapshot_manager.prepare_resume(None, tmp_path)
     assert not snapshot_manager.resuming_from_intermediate()
     assert not snapshot_manager.resuming_from_final()
 
@@ -31,22 +28,20 @@ def test_save_load_snapshot(tmp_path: Path) -> None:
     instance_id = Reference('test[1]')
     snapshot_manager = SnapshotManager(instance_id, manager, communicator)
 
-    checkpoints = Checkpoints(simulation_time=[CheckpointRangeRule(every=1)])
-    snapshot_manager._set_checkpoint_info(
-            datetime.now(timezone.utc), checkpoints, None, tmp_path)
-
-    assert snapshot_manager.should_save_snapshot(0.2)
+    snapshot_manager.prepare_resume(None, tmp_path)
     assert not snapshot_manager.resuming_from_intermediate()
     assert not snapshot_manager.resuming_from_final()
-    snapshot_manager.save_snapshot(Message(0.2, None, 'test data'))
+
+    snapshot_manager.save_snapshot(
+            Message(0.2, None, 'test data'), False, ['test'], 13.0)
 
     communicator.get_message_counts.assert_called_with()
     manager.submit_snapshot_metadata.assert_called()
     instance, metadata = manager.submit_snapshot_metadata.call_args[0]
     assert instance == instance_id
     assert isinstance(metadata, SnapshotMetadata)
-    assert metadata.triggers
-    assert metadata.wallclock_time > 0.0
+    assert metadata.triggers == ['test']
+    assert metadata.wallclock_time == 13.0
     assert metadata.timestamp == 0.2
     assert metadata.next_timestamp is None
     assert metadata.port_message_counts == port_message_counts
@@ -57,8 +52,7 @@ def test_save_load_snapshot(tmp_path: Path) -> None:
 
     snapshot_manager2 = SnapshotManager(instance_id, manager, communicator)
 
-    snapshot_manager2._set_checkpoint_info(
-            datetime.now(timezone.utc), checkpoints, snapshot_path, tmp_path)
+    snapshot_manager2.prepare_resume(snapshot_path, tmp_path)
     communicator.restore_message_counts.assert_called_with(port_message_counts)
 
     assert snapshot_manager2.resuming_from_intermediate()
@@ -68,16 +62,14 @@ def test_save_load_snapshot(tmp_path: Path) -> None:
     assert msg.next_timestamp is None
     assert msg.data == 'test data'
 
-    assert not snapshot_manager2.should_save_snapshot(0.4)
-    assert snapshot_manager2.should_save_final_snapshot(True, 1.2)
-    snapshot_manager2.save_final_snapshot(
-            Message(0.6, None, 'test data2'), 1.2)
+    snapshot_manager2.save_snapshot(
+            Message(0.6, None, 'test data2'), True, ['test'], 42.2, 1.2)
 
     instance, metadata = manager.submit_snapshot_metadata.call_args[0]
     assert instance == instance_id
     assert isinstance(metadata, SnapshotMetadata)
-    assert metadata.triggers
-    assert metadata.wallclock_time > 0.0
+    assert metadata.triggers == ['test']
+    assert metadata.wallclock_time == 42.2
     assert metadata.timestamp == 0.6
     assert metadata.next_timestamp is None
     assert metadata.port_message_counts == port_message_counts
@@ -102,13 +94,13 @@ def test_save_load_implicit_snapshot(tmp_path: Path) -> None:
     instance_id = Reference('test[1]')
     snapshot_manager = SnapshotManager(instance_id, manager, communicator)
 
-    checkpoints = Checkpoints(simulation_time=[CheckpointRangeRule(every=1)])
-    snapshot_manager._set_checkpoint_info(
-            datetime.now(timezone.utc), checkpoints, None, tmp_path)
+    snapshot_manager.prepare_resume(None, tmp_path)
 
     assert not snapshot_manager.resuming_from_intermediate()
     assert not snapshot_manager.resuming_from_final()
-    snapshot_manager.save_implicit_snapshot(1.5)
+    # save implicit snapshot
+    snapshot_manager.save_snapshot(None, True, ['implicit'], 1.0, 1.5)
+
     manager.submit_snapshot_metadata.assert_called_once()
     instance, metadata = manager.submit_snapshot_metadata.call_args[0]
     assert instance == instance_id
@@ -118,13 +110,12 @@ def test_save_load_implicit_snapshot(tmp_path: Path) -> None:
 
     snapshot_manager2 = SnapshotManager(instance_id, manager, communicator)
 
-    snapshot_manager2._set_checkpoint_info(
-            datetime.now(timezone.utc), checkpoints, snapshot_path, tmp_path)
+    snapshot_manager2.prepare_resume(snapshot_path, tmp_path)
     communicator.restore_message_counts.assert_called_with(port_message_counts)
     manager.submit_snapshot_metadata.assert_called_once()
     manager.submit_snapshot_metadata.reset_mock()
 
     assert not snapshot_manager2.resuming_from_intermediate()
     assert not snapshot_manager2.resuming_from_final()
-    snapshot_manager2.save_implicit_snapshot(2.5)
+    snapshot_manager2.save_snapshot(None, True, ['implicit'], 12.3, 2.5)
     manager.submit_snapshot_metadata.assert_called_once()

From 57b618fc18c710cefadc10fb00c14c142f1b44a0 Mon Sep 17 00:00:00 2001
From: Lourens Veen <l.veen@esciencecenter.nl>
Date: Thu, 5 Jan 2023 19:47:08 +0100
Subject: [PATCH 120/183] Improve dealing with wall-clock time checkpoints

---
 integration_test/test_cpp_mpp_client.py       |  2 +-
 libmuscle/cpp/src/libmuscle/communicator.cpp  |  3 +-
 libmuscle/cpp/src/libmuscle/mpp_message.cpp   |  5 +-
 libmuscle/cpp/src/libmuscle/mpp_message.hpp   |  3 +-
 .../libmuscle/tests/mocks/mock_mpp_client.cpp |  3 +-
 .../tests/mocks/mock_post_office.cpp          |  4 +-
 .../tests/tcp_transport_server_test.cpp       |  2 +-
 .../src/libmuscle/tests/test_mpp_message.cpp  |  8 ++-
 .../cpp/src/libmuscle/tests/test_outbox.cpp   |  2 +-
 .../src/libmuscle/tests/test_post_office.cpp  |  2 +-
 .../tests/test_tcp_communication.cpp          |  3 +-
 .../python/libmuscle/checkpoint_triggers.py   | 46 ++++++++--------
 libmuscle/python/libmuscle/communicator.py    | 39 +++++++------
 libmuscle/python/libmuscle/instance.py        | 19 +++++--
 .../python/libmuscle/manager/mmp_server.py    |  7 +--
 .../manager/test/test_mmp_request_handler.py  |  6 +-
 libmuscle/python/libmuscle/mmp_client.py      | 19 +++----
 libmuscle/python/libmuscle/mpp_message.py     | 11 +++-
 libmuscle/python/libmuscle/snapshot.py        |  2 +-
 .../test/test_checkpoint_triggers.py          | 19 +++----
 .../libmuscle/test/test_communicator.py       | 55 +++++++++++--------
 .../python/libmuscle/test/test_instance.py    | 28 +++++-----
 .../python/libmuscle/test/test_mpp_message.py | 12 ++--
 .../python/libmuscle/test/test_outbox.py      |  2 +-
 24 files changed, 167 insertions(+), 135 deletions(-)

diff --git a/integration_test/test_cpp_mpp_client.py b/integration_test/test_cpp_mpp_client.py
index 7541993e..976dc41c 100644
--- a/integration_test/test_cpp_mpp_client.py
+++ b/integration_test/test_cpp_mpp_client.py
@@ -23,7 +23,7 @@ def tcp_server_process(control_pipe):
     message = MPPMessage(
             Reference('test_sender.test_port'),
             receiver,
-            10, 1.0, 2.0, settings, 0, data).encoded()
+            10, 1.0, 2.0, settings, 0, 1.0, data).encoded()
 
     def handle_request(request_bytes):
         request = msgpack.unpackb(request_bytes, raw=False)
diff --git a/libmuscle/cpp/src/libmuscle/communicator.cpp b/libmuscle/cpp/src/libmuscle/communicator.cpp
index 269a1139..644d67ae 100644
--- a/libmuscle/cpp/src/libmuscle/communicator.cpp
+++ b/libmuscle/cpp/src/libmuscle/communicator.cpp
@@ -133,7 +133,8 @@ void Communicator::send_message(
         MPPMessage mpp_message(
                 snd_endpoint.ref(), recv_endpoint.ref(),
                 port_length, message.timestamp(), Optional<double>(),
-                settings_overlay, port.get_num_messages(slot), message.data());
+                settings_overlay, port.get_num_messages(slot), -1.0,
+                message.data());
 
         if (message.has_next_timestamp())
             mpp_message.next_timestamp = message.next_timestamp();
diff --git a/libmuscle/cpp/src/libmuscle/mpp_message.cpp b/libmuscle/cpp/src/libmuscle/mpp_message.cpp
index bf1be0f0..344adb23 100644
--- a/libmuscle/cpp/src/libmuscle/mpp_message.cpp
+++ b/libmuscle/cpp/src/libmuscle/mpp_message.cpp
@@ -16,7 +16,7 @@ MPPMessage::MPPMessage(
             ::libmuscle::impl::Optional<int> port_length,
             double timestamp, ::libmuscle::impl::Optional<double> next_timestamp,
             DataConstRef const & settings_overlay,
-            int message_number,
+            int message_number, double saved_until,
             DataConstRef const & data
             )
         : sender(sender)
@@ -26,6 +26,7 @@ MPPMessage::MPPMessage(
         , next_timestamp(next_timestamp)
         , settings_overlay(settings_overlay)
         , message_number(message_number)
+        , saved_until(saved_until)
         , data(data)
     {}
 
@@ -51,6 +52,7 @@ MPPMessage MPPMessage::from_bytes(DataConstRef const & data) {
             next_timestamp,
             dict["settings_overlay"],
             dict["message_number"].as<int>(),
+            dict["saved_until"].as<double>(),
             dict["data"]);
 }
 
@@ -71,6 +73,7 @@ DataConstRef MPPMessage::encoded() const {
             "next_timestamp", next_timestamp_data,
             "settings_overlay", settings_overlay,
             "message_number", message_number,
+            "saved_until", saved_until,
             "data", data
             );
 
diff --git a/libmuscle/cpp/src/libmuscle/mpp_message.hpp b/libmuscle/cpp/src/libmuscle/mpp_message.hpp
index 96a26fe0..69f15645 100644
--- a/libmuscle/cpp/src/libmuscle/mpp_message.hpp
+++ b/libmuscle/cpp/src/libmuscle/mpp_message.hpp
@@ -33,7 +33,7 @@ struct MPPMessage {
             ::libmuscle::impl::Optional<int> port_length,
             double timestamp, ::libmuscle::impl::Optional<double> next_timestamp,
             DataConstRef const & settings_overlay, int message_number,
-            DataConstRef const & data);
+            double saved_until, DataConstRef const & data);
 
     /** Create an MCP Message from an encoded buffer.
      *
@@ -54,6 +54,7 @@ struct MPPMessage {
     ::libmuscle::impl::Optional<double> next_timestamp;
     DataConstRef settings_overlay;
     int message_number;
+    double saved_until;
     DataConstRef data;
 };
 
diff --git a/libmuscle/cpp/src/libmuscle/tests/mocks/mock_mpp_client.cpp b/libmuscle/cpp/src/libmuscle/tests/mocks/mock_mpp_client.cpp
index 55ae3a76..31507a44 100644
--- a/libmuscle/cpp/src/libmuscle/tests/mocks/mock_mpp_client.cpp
+++ b/libmuscle/cpp/src/libmuscle/tests/mocks/mock_mpp_client.cpp
@@ -44,7 +44,8 @@ Settings MockMPPClient::make_overlay_() {
 }
 
 MPPMessage MockMPPClient::next_receive_message(
-        "test.out", "test2.in", 0, 0.0, 1.0, make_overlay_(), 0, Data::dict("test1", 12));
+        "test.out", "test2.in", 0, 0.0, 1.0, make_overlay_(),0, 9.0,
+        Data::dict("test1", 12));
 
 Reference MockMPPClient::last_receiver("_none");
 
diff --git a/libmuscle/cpp/src/libmuscle/tests/mocks/mock_post_office.cpp b/libmuscle/cpp/src/libmuscle/tests/mocks/mock_post_office.cpp
index 6d2bb3cc..cb6a92d5 100644
--- a/libmuscle/cpp/src/libmuscle/tests/mocks/mock_post_office.cpp
+++ b/libmuscle/cpp/src/libmuscle/tests/mocks/mock_post_office.cpp
@@ -6,13 +6,13 @@ int MockPostOffice::handle_request(
         char const * res_buf, std::size_t res_len,
         std::unique_ptr<DataConstRef> & response) {
     response = std::make_unique<DataConstRef>(
-            MPPMessage("test.out", "test2.in", 0, 0.0, 1.0, Data(), 0, Data()).encoded());
+            MPPMessage("test.out", "test2.in", 0, 0.0, 1.0, Data(), 0, 0.0, Data()).encoded());
     return -1;
 }
 
 std::unique_ptr<DataConstRef> MockPostOffice::get_response(int fd) {
     return std::make_unique<DataConstRef>(
-            MPPMessage("test.out", "test2.in", 0, 0.0, 1.0, Data(), 0, Data()).encoded());
+            MPPMessage("test.out", "test2.in", 0, 0.0, 1.0, Data(), 0, 8.0, Data()).encoded());
 }
 
 void MockPostOffice::deposit(
diff --git a/libmuscle/cpp/src/libmuscle/tests/tcp_transport_server_test.cpp b/libmuscle/cpp/src/libmuscle/tests/tcp_transport_server_test.cpp
index 248f597f..7082a0bc 100644
--- a/libmuscle/cpp/src/libmuscle/tests/tcp_transport_server_test.cpp
+++ b/libmuscle/cpp/src/libmuscle/tests/tcp_transport_server_test.cpp
@@ -38,7 +38,7 @@ int main(int argc, char *argv[]) {
             "test_sender.port", receiver, 10,
             0.0, 1.0,
             overlay_settings,
-            0,
+            0, 6.0,
             data_dict);
     auto msg_data = std::make_unique<DataConstRef>(msg.encoded());
     post_office.deposit(receiver, std::move(msg_data));
diff --git a/libmuscle/cpp/src/libmuscle/tests/test_mpp_message.cpp b/libmuscle/cpp/src/libmuscle/tests/test_mpp_message.cpp
index 53f2ed28..cebcedd5 100644
--- a/libmuscle/cpp/src/libmuscle/tests/test_mpp_message.cpp
+++ b/libmuscle/cpp/src/libmuscle/tests/test_mpp_message.cpp
@@ -29,7 +29,7 @@ TEST(test_mcp_message, create_mcp_message) {
             Reference("sender.port"), Reference("receiver.port"),
             10,
             100.1, 101.0,
-            test, 0, abc
+            test, 0, 1.0, abc
             );
 
     ASSERT_EQ(m.sender, "sender.port");
@@ -39,6 +39,7 @@ TEST(test_mcp_message, create_mcp_message) {
     ASSERT_EQ(m.next_timestamp, 101.0);
     ASSERT_EQ(m.settings_overlay.as<std::string>(), "test");
     ASSERT_EQ(m.message_number, 0);
+    ASSERT_EQ(m.saved_until, 1.0);
     ASSERT_EQ(m.data.as<std::string>(), "abc");
 }
 
@@ -49,7 +50,7 @@ TEST(test_mcp_message, create_mcp_message_minimal) {
             Reference("sender.port"), Reference("receiver.port"),
             {},
             100.1, {},
-            test, 0, abc
+            test, 0, 2.0, abc
             );
 
     ASSERT_EQ(m.sender, "sender.port");
@@ -59,6 +60,7 @@ TEST(test_mcp_message, create_mcp_message_minimal) {
     ASSERT_FALSE(m.next_timestamp.is_set());
     ASSERT_TRUE(m.settings_overlay.is_nil());
     ASSERT_EQ(m.message_number, 0);
+    ASSERT_EQ(m.saved_until, 2.0);
     ASSERT_TRUE(m.data.is_nil());
 }
 
@@ -71,6 +73,7 @@ TEST(test_mcp_message, from_bytes) {
             "next_timestamp", Data(),
             "settings_overlay", Data(),
             "message_number", 0,
+            "saved_until", 3.0,
             "data", Data()
             );
 
@@ -88,6 +91,7 @@ TEST(test_mcp_message, from_bytes) {
     ASSERT_FALSE(m.next_timestamp.is_set());
     ASSERT_TRUE(m.settings_overlay.is_nil());
     ASSERT_EQ(m.message_number, 0);
+    ASSERT_EQ(m.saved_until, 3.0);
     ASSERT_TRUE(m.data.is_nil());
 }
 
diff --git a/libmuscle/cpp/src/libmuscle/tests/test_outbox.cpp b/libmuscle/cpp/src/libmuscle/tests/test_outbox.cpp
index 0d6769c5..e98bd423 100644
--- a/libmuscle/cpp/src/libmuscle/tests/test_outbox.cpp
+++ b/libmuscle/cpp/src/libmuscle/tests/test_outbox.cpp
@@ -36,7 +36,7 @@ TEST(libmuscle_outbox, test_deposit_retrieve_message) {
             Optional<int>(),
             0.0, 1.0,
             DataConstRef(),
-            0,
+            0, 1.0,
             DataConstRef("testing"));
 
     auto message_data = std::make_unique<DataConstRef>(message.encoded());
diff --git a/libmuscle/cpp/src/libmuscle/tests/test_post_office.cpp b/libmuscle/cpp/src/libmuscle/tests/test_post_office.cpp
index f6cf05c2..bf6981d8 100644
--- a/libmuscle/cpp/src/libmuscle/tests/test_post_office.cpp
+++ b/libmuscle/cpp/src/libmuscle/tests/test_post_office.cpp
@@ -36,7 +36,7 @@ std::unique_ptr<DataConstRef> make_message() {
             "test_sender.port", "test_receiver.port",
             Optional<int>(),
             0.0, 1.0,
-            DataConstRef(), 0, DataConstRef());
+            DataConstRef(), 0, 5.0, DataConstRef());
     return std::make_unique<DataConstRef>(msg.encoded());
 }
 
diff --git a/libmuscle/cpp/src/libmuscle/tests/test_tcp_communication.cpp b/libmuscle/cpp/src/libmuscle/tests/test_tcp_communication.cpp
index c6400404..2d152161 100644
--- a/libmuscle/cpp/src/libmuscle/tests/test_tcp_communication.cpp
+++ b/libmuscle/cpp/src/libmuscle/tests/test_tcp_communication.cpp
@@ -36,7 +36,7 @@ TEST(test_tcp_communication, send_receive) {
     MPPMessage msg(
             "test_sender.port", receiver, 10,
             0.0, 1.0,
-            Data::dict("par1", 13), 1,
+            Data::dict("par1", 13), 1, 4.0,
             Data::dict("var1", 1, "var2", 2.0, "var3", "3"));
     auto msg_data = std::make_unique<DataConstRef>(msg.encoded());
     post_office.deposit(receiver, std::move(msg_data));
@@ -54,6 +54,7 @@ TEST(test_tcp_communication, send_receive) {
     ASSERT_EQ(m.next_timestamp, 1.0);
     ASSERT_EQ(m.settings_overlay["par1"].as<int>(), 13);
     ASSERT_EQ(m.message_number, 1);
+    ASSERT_EQ(m.saved_until, 4.0);
     ASSERT_EQ(m.data["var1"].as<int>(), 1);
     ASSERT_EQ(m.data["var2"].as<double>(), 2.0);
     ASSERT_EQ(m.data["var3"].as<std::string>(), "3");
diff --git a/libmuscle/python/libmuscle/checkpoint_triggers.py b/libmuscle/python/libmuscle/checkpoint_triggers.py
index 88a561f8..7ea2759d 100644
--- a/libmuscle/python/libmuscle/checkpoint_triggers.py
+++ b/libmuscle/python/libmuscle/checkpoint_triggers.py
@@ -1,5 +1,4 @@
 import bisect
-from datetime import datetime, timezone
 import logging
 import time
 from typing import List, Optional, Union
@@ -160,18 +159,6 @@ def previous_checkpoint(self, cur_time: float) -> Optional[float]:
                    default=None)  # return None if all triggers return None
 
 
-def _utc_to_monotonic(utc: datetime) -> float:
-    """Convert UTC time point to a reference value of time.monotonic()
-
-    Args:
-        utc: datetime in UTC timezone
-    """
-    curmono = time.monotonic()
-    curutc = datetime.now(timezone.utc)
-    elapsed_seconds = (curutc - utc).total_seconds()
-    return curmono - elapsed_seconds
-
-
 class TriggerManager:
     """Manages all checkpoint triggers and checks if a snapshot must be saved.
     """
@@ -179,18 +166,19 @@ class TriggerManager:
     def __init__(self) -> None:
         self._has_checkpoints = False
         self._last_triggers = []    # type: List[str]
-        self._monotonic_reference = time.monotonic()
+        self._cpts_considered_until = float('-inf')
 
     def set_checkpoint_info(
-            self, utc_reference: datetime, checkpoints: Checkpoints) -> None:
+            self, elapsed: float, checkpoints: Checkpoints) -> None:
         """Register checkpoint info received from the muscle manager.
         """
+        self._mono_to_elapsed = elapsed - time.monotonic()
+
         if not checkpoints:
             self._has_checkpoints = False
             return
 
         self._has_checkpoints = True
-        self._monotonic_reference = _utc_to_monotonic(utc_reference)
 
         self._checkpoint_at_end = checkpoints.at_end
 
@@ -206,7 +194,19 @@ def set_checkpoint_info(
     def elapsed_walltime(self) -> float:
         """Returns elapsed wallclock_time in seconds.
         """
-        return time.monotonic() - self._monotonic_reference
+        return time.monotonic() + self._mono_to_elapsed
+
+    def checkpoints_considered_until(self) -> float:
+        """Return elapsed time of last should_save*
+        """
+        return self._cpts_considered_until
+
+    def harmonise_wall_time(self, at_least: float) -> None:
+        """Ensure our elapsed time is at least the given value
+        """
+        cur = self.elapsed_walltime()
+        if cur < at_least:
+            self._mono_to_elapsed += at_least - cur
 
     def snapshots_enabled(self) -> bool:
         """Check if the current workflow has snapshots enabled.
@@ -219,8 +219,7 @@ def should_save_snapshot(self, timestamp: float) -> bool:
         if not self._has_checkpoints:
             return False
 
-        elapsed_walltime = self.elapsed_walltime()
-        return self.__should_save(elapsed_walltime, timestamp)
+        return self.__should_save(timestamp)
 
     def should_save_final_snapshot(
             self, do_reuse: bool, f_init_max_timestamp: Optional[float]
@@ -241,8 +240,7 @@ def should_save_final_snapshot(
                           ' Not creating a snapshot.')
             self._sim_reset = True
         else:
-            elapsed_walltime = self.elapsed_walltime()
-            value = self.__should_save(elapsed_walltime, f_init_max_timestamp)
+            value = self.__should_save(f_init_max_timestamp)
 
         return value
 
@@ -270,11 +268,10 @@ def get_triggers(self) -> List[str]:
         self._last_triggers = []
         return triggers
 
-    def __should_save(self, walltime: float, simulation_time: float) -> bool:
+    def __should_save(self, simulation_time: float) -> bool:
         """Check if a checkpoint should be taken
 
         Args:
-            walltime: current wallclock time (elapsed since reference)
             simulation_time: current/next timestamp as reported by the instance
         """
         if self._sim_reset:
@@ -290,6 +287,9 @@ def __should_save(self, walltime: float, simulation_time: float) -> bool:
                 self._nextsim = self._sim.next_checkpoint(simulation_time)
             self._sim_reset = False
 
+        walltime = self.elapsed_walltime()
+        self._cpts_considered_until = walltime
+
         self._last_triggers = []
         if self._nextwall is not None and walltime >= self._nextwall:
             self._last_triggers.append(f"wallclock_time >= {self._nextwall}")
diff --git a/libmuscle/python/libmuscle/communicator.py b/libmuscle/python/libmuscle/communicator.py
index bf1cf33e..69272f78 100644
--- a/libmuscle/python/libmuscle/communicator.py
+++ b/libmuscle/python/libmuscle/communicator.py
@@ -181,7 +181,8 @@ def get_port(self, port_name: str) -> Port:
 
     def send_message(
             self, port_name: str, message: Message,
-            slot: Optional[int] = None) -> None:
+            slot: Optional[int] = None,
+            checkpoints_considered_until: float = float('-inf')) -> None:
         """Send a message and settings to the outside world.
 
         Sending is non-blocking, a copy of the message will be made
@@ -191,6 +192,8 @@ def send_message(
             port_name: The port on which this message is to be sent.
             message: The message to be sent.
             slot: The slot to send the message on, if any.
+            checkpoints_considered_until: When we last checked if we
+                should save a snapshot (wallclock time).
         """
         if slot is None:
             _logger.debug('Sending message on {}'.format(port_name))
@@ -227,6 +230,7 @@ def send_message(
                                      message.timestamp, message.next_timestamp,
                                      cast(Settings, message.settings),
                                      port.get_num_messages(slot),
+                                     checkpoints_considered_until,
                                      message.data)
             encoded_message = mcp_message.encoded()
             self._post_office.deposit(recv_endpoint.ref(), encoded_message)
@@ -240,7 +244,7 @@ def send_message(
 
     def receive_message(self, port_name: str, slot: Optional[int] = None,
                         default: Optional[Message] = None
-                        ) -> Message:
+                        ) -> Tuple[Message, float]:
         """Receive a message and attached settings overlay.
 
         Receiving is a blocking operation. This function will contact
@@ -260,7 +264,8 @@ def receive_message(self, port_name: str, slot: Optional[int] = None,
         Returns:
             The received message, with message.settings holding
             the settings overlay. The settings attribute is
-            guaranteed to not be None.
+            guaranteed to not be None. Secondly, the saved_until
+            metadata field from the received message.
 
         Raises:
             RuntimeError: If no default was given and the port is not
@@ -286,7 +291,7 @@ def receive_message(self, port_name: str, slot: Optional[int] = None,
             _logger.debug(
                     'No message received on {} as it is not connected'.format(
                         port_name))
-            return default
+            return default, float('-inf')
 
         if port_name in self._ports:
             port = self._ports[port_name]
@@ -304,28 +309,28 @@ def receive_message(self, port_name: str, slot: Optional[int] = None,
         snd_endpoint = self._peer_manager.get_peer_endpoints(
                 recv_endpoint.port, slot_list)[0]
         client = self.__get_client(snd_endpoint.instance())
-        mcp_message_bytes = client.receive(recv_endpoint.ref())
-        mcp_message = MPPMessage.from_bytes(mcp_message_bytes)
+        mpp_message_bytes = client.receive(recv_endpoint.ref())
+        mpp_message = MPPMessage.from_bytes(mpp_message_bytes)
 
-        if mcp_message.port_length is not None:
+        if mpp_message.port_length is not None:
             if port.is_resizable():
-                port.set_length(mcp_message.port_length)
+                port.set_length(mpp_message.port_length)
 
-        if isinstance(mcp_message.data, ClosePort):
+        if isinstance(mpp_message.data, ClosePort):
             port.set_closed(slot)
 
         message = Message(
-                mcp_message.timestamp, mcp_message.next_timestamp,
-                mcp_message.data, mcp_message.settings_overlay)
+                mpp_message.timestamp, mpp_message.next_timestamp,
+                mpp_message.data, mpp_message.settings_overlay)
 
         profile_event.stop()
         if port.is_vector():
             profile_event.port_length = port.get_length()
-        profile_event.message_size = len(mcp_message_bytes)
+        profile_event.message_size = len(mpp_message_bytes)
 
         expected_message_number = port.get_num_messages(slot)
-        if expected_message_number != mcp_message.message_number:
-            if (expected_message_number - 1 == mcp_message.message_number and
+        if expected_message_number != mpp_message.message_number:
+            if (expected_message_number - 1 == mpp_message.message_number and
                     port.is_resuming(slot)):
                 _logger.debug(f'Discarding received message on {port_and_slot}'
                               ': resuming from weakly consistent snapshot')
@@ -333,16 +338,16 @@ def receive_message(self, port_name: str, slot: Optional[int] = None,
                 return self.receive_message(port_name, slot, default)
             raise RuntimeError(f'Received message on {port_and_slot} with'
                                ' unexpected message number'
-                               f' {mcp_message.message_number}. Was expecting'
+                               f' {mpp_message.message_number}. Was expecting'
                                f' {expected_message_number}. Are you resuming'
                                ' from an inconsistent snapshot?')
         port.increment_num_messages(slot)
 
         _logger.debug('Received message on {}'.format(port_and_slot))
-        if isinstance(mcp_message.data, ClosePort):
+        if isinstance(mpp_message.data, ClosePort):
             _logger.debug('Port {} is now closed'.format(port_and_slot))
 
-        return message
+        return message, mpp_message.saved_until
 
     def close_port(self, port_name: str, slot: Optional[int] = None
                    ) -> None:
diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py
index a94ccb76..1f8dc1dd 100644
--- a/libmuscle/python/libmuscle/instance.py
+++ b/libmuscle/python/libmuscle/instance.py
@@ -118,8 +118,8 @@ def __init__(self, ports: Optional[Dict[Operator, List[str]]] = None,
         checkpoint_info = self.__manager.get_checkpoint_info(
                 self._instance_name())
 
-        utc_reference, checkpoints = checkpoint_info[0:2]
-        self._trigger_manager.set_checkpoint_info(utc_reference, checkpoints)
+        elapsed_time, checkpoints = checkpoint_info[0:2]
+        self._trigger_manager.set_checkpoint_info(elapsed_time, checkpoints)
 
         resume_snapshot, snapshot_dir = checkpoint_info[2:4]
         saved_at = self._snapshot_manager.prepare_resume(
@@ -378,7 +378,9 @@ def send(self, port_name: str, message: Message,
             message = copy(message)
             message.settings = self._settings_manager.overlay
 
-        self._communicator.send_message(port_name, message, slot)
+        self._communicator.send_message(
+                port_name, message, slot,
+                self._trigger_manager.checkpoints_considered_until())
 
     def receive(self, port_name: str, slot: Optional[int] = None,
                 default: Optional[Message] = None
@@ -826,7 +828,7 @@ def __receive_message(
                     raise RuntimeError(err_msg)
 
         else:
-            msg = self._communicator.receive_message(
+            msg, saved_until = self._communicator.receive_message(
                     port_name, slot, default)
             if port.is_connected() and not port.is_open(slot):
                 err_msg = (('Port {} was closed while trying to'
@@ -838,6 +840,7 @@ def __receive_message(
                 self.__check_compatibility(port_name, msg.settings)
             if not with_settings:
                 msg.settings = None
+            self._trigger_manager.harmonise_wall_time(saved_until)
         return msg
 
     def __make_full_name(self
@@ -942,7 +945,7 @@ def __receive_settings(self) -> bool:
             False iff the port is connnected and ClosePort was received.
         """
         default_message = Message(0.0, None, Settings(), Settings())
-        message = self._communicator.receive_message(
+        message, saved_until = self._communicator.receive_message(
                 'muscle_settings_in', None, default_message)
         if isinstance(message.data, ClosePort):
             return False
@@ -959,6 +962,8 @@ def __receive_settings(self) -> bool:
         for key, value in message.data.items():
             settings[key] = value
         self._settings_manager.overlay = settings
+
+        self._trigger_manager.harmonise_wall_time(saved_until)
         return True
 
     def __pre_receive_f_init(self, apply_overlay: bool) -> None:
@@ -968,12 +973,14 @@ def __pre_receive_f_init(self, apply_overlay: bool) -> None:
         in self._f_init_cache.
         """
         def pre_receive(port_name: str, slot: Optional[int]) -> None:
-            msg = self._communicator.receive_message(port_name, slot)
+            msg, saved_until = self._communicator.receive_message(
+                    port_name, slot)
             self._f_init_cache[(port_name, slot)] = msg
             if apply_overlay:
                 self.__apply_overlay(msg)
                 self.__check_compatibility(port_name, msg.settings)
                 msg.settings = None
+            self._trigger_manager.harmonise_wall_time(saved_until)
 
         self._f_init_cache = dict()
         ports = self._communicator.list_ports()
diff --git a/libmuscle/python/libmuscle/manager/mmp_server.py b/libmuscle/python/libmuscle/manager/mmp_server.py
index f689e6f5..d609fce1 100644
--- a/libmuscle/python/libmuscle/manager/mmp_server.py
+++ b/libmuscle/python/libmuscle/manager/mmp_server.py
@@ -1,6 +1,6 @@
-from datetime import datetime, timezone
 import errno
 import logging
+import time
 from typing import Any, Dict, cast, List, Optional
 
 import msgpack
@@ -74,8 +74,7 @@ def __init__(
         self._topology_store = topology_store
         self._snapshot_registry = snapshot_registry
         self._run_dir = run_dir
-        self._reference_time = datetime.now(timezone.utc)
-        self._reference_timestamp = self._reference_time.timestamp()
+        self._reference_time = time.monotonic()
 
     def handle_request(self, request: bytes) -> bytes:
         """Handles a manager request.
@@ -303,7 +302,7 @@ def _get_checkpoint_info(self, instance_id: str) -> Any:
             snapshot_directory = str(self._run_dir.snapshot_dir(instance))
 
         return [ResponseType.SUCCESS.value,
-                self._reference_timestamp,
+                time.monotonic() - self._reference_time,
                 encode_checkpoints(self._configuration.checkpoints),
                 resume,
                 snapshot_directory]
diff --git a/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py b/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py
index 876ae197..bc61f0a0 100644
--- a/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py
+++ b/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py
@@ -1,5 +1,4 @@
 import dataclasses
-from datetime import datetime, timezone
 from pathlib import Path
 from unittest.mock import MagicMock
 
@@ -109,10 +108,9 @@ def test_get_checkpoint_info(mmp_configuration, mmp_request_handler):
     decoded_result = msgpack.unpackb(result, raw=False)
 
     assert decoded_result[0] == ResponseType.SUCCESS.value
-    timestamp, checkpoints, resume, snapshot_directory = decoded_result[1:]
+    elapsed_time, checkpoints, resume, snapshot_directory = decoded_result[1:]
 
-    ref_time = datetime.fromtimestamp(timestamp, tz=timezone.utc)
-    assert ref_time == mmp_request_handler._reference_time
+    assert elapsed_time > 0.0
 
     assert isinstance(checkpoints, dict)
     assert checkpoints.keys() == {'at_end', 'wallclock_time', 'simulation_time'}
diff --git a/libmuscle/python/libmuscle/mmp_client.py b/libmuscle/python/libmuscle/mmp_client.py
index 14e83e9a..eed4d99a 100644
--- a/libmuscle/python/libmuscle/mmp_client.py
+++ b/libmuscle/python/libmuscle/mmp_client.py
@@ -1,5 +1,4 @@
 import dataclasses
-from datetime import datetime, timezone
 from pathlib import Path
 from random import uniform
 from time import perf_counter, sleep
@@ -23,7 +22,7 @@
 PEER_INTERVAL_MAX = 10.0
 
 _CheckpointInfoType = Tuple[
-        datetime, Checkpoints, Optional[Path], Optional[Path]]
+        float, Checkpoints, Optional[Path], Optional[Path]]
 
 
 def encode_operator(op: Operator) -> str:
@@ -64,7 +63,7 @@ def decode_checkpoint_rule(rule: Dict[str, Any]) -> CheckpointRule:
 
 
 def decode_checkpoint_info(
-        reference_timestamp: float,
+        elapsed_time: float,
         checkpoints_dict: Dict[str, Any],
         resume: Optional[str],
         snapshot_dir: Optional[str]
@@ -72,19 +71,17 @@ def decode_checkpoint_info(
     """Decode checkpoint info from a MsgPack-compatible value.
 
     Args:
-        reference_timestamp: seconds since UNIX epoch in UTC timezone to use as
-            wallclock_time = 0
+        elapsed_time: current elapsed time according to the manager
         checkpoints_dict: checkpoint definitions from the MsgPack
         resume: path to the snapshot we should resume from, if any
         snapshot_dir: path to the directory to store new snapshots in
 
     Returns:
-        wallclock_time_reference: UTC time where wallclock_time = 0
+        elapsed_time: current elapsed time according to the manager
         checkpoints: checkpoint configuration
-        resume: path to the resume snapshot
-        snapshot_dir: path to store the snapshots in
+        resume: path to the snapshot we should resume from, if any
+        snapshot_dir: path to the directory to store new snapshots in
     """
-    ref_time = datetime.fromtimestamp(reference_timestamp, tz=timezone.utc)
     checkpoints = Checkpoints(
             at_end=checkpoints_dict["at_end"],
             wallclock_time=[decode_checkpoint_rule(rule)
@@ -93,7 +90,7 @@ def decode_checkpoint_info(
                              for rule in checkpoints_dict["simulation_time"]])
     resume_path = None if resume is None else Path(resume)
     snapshot_path = None if snapshot_dir is None else Path(snapshot_dir)
-    return (ref_time, checkpoints, resume_path, snapshot_path)
+    return (elapsed_time, checkpoints, resume_path, snapshot_path)
 
 
 class MMPClient():
@@ -173,7 +170,7 @@ def get_checkpoint_info(self, name: Reference) -> _CheckpointInfoType:
         """Get the checkpoint info from the manager.
 
         Returns:
-            wallclock_time_reference: UTC time where wallclock_time = 0
+            elapsed_time: current elapsed time
             checkpoints: checkpoint configuration
             resume: path to the resume snapshot
             snapshot_directory: path to store snapshots
diff --git a/libmuscle/python/libmuscle/mpp_message.py b/libmuscle/python/libmuscle/mpp_message.py
index b9033d75..c57effb3 100644
--- a/libmuscle/python/libmuscle/mpp_message.py
+++ b/libmuscle/python/libmuscle/mpp_message.py
@@ -151,7 +151,8 @@ class MPPMessage:
     def __init__(self, sender: Reference, receiver: Reference,
                  port_length: Optional[int],
                  timestamp: float, next_timestamp: Optional[float],
-                 settings_overlay: Settings, message_number: int, data: Any
+                 settings_overlay: Settings, message_number: int,
+                 saved_until: float, data: Any
                  ) -> None:
         """Create an MPPMessage.
 
@@ -169,6 +170,9 @@ def __init__(self, sender: Reference, receiver: Reference,
             receiver: The receiving endpoint.
             port_length: Length of the slot, where applicable.
             settings_overlay: The serialised overlay settings.
+            message_number: Sequence number on this conduit.
+            saved_until: Elapsed time until which the sender has
+                processed checkpoints.
             data: The serialised contents of the message.
         """
         # make sure timestamp and next_timestamp are floats
@@ -183,6 +187,7 @@ def __init__(self, sender: Reference, receiver: Reference,
         self.next_timestamp = next_timestamp
         self.settings_overlay = settings_overlay
         self.message_number = message_number
+        self.saved_until = saved_until
         if isinstance(data, np.ndarray):
             self.data = Grid(data)
         else:
@@ -204,11 +209,12 @@ def from_bytes(message: bytes) -> 'MPPMessage':
         next_timestamp = message_dict["next_timestamp"]
         settings_overlay = message_dict["settings_overlay"]
         message_number = message_dict["message_number"]
+        saved_until = message_dict["saved_until"]
 
         data = message_dict["data"]
         return MPPMessage(
                 sender, receiver, port_length, timestamp, next_timestamp,
-                settings_overlay, message_number, data)
+                settings_overlay, message_number, saved_until, data)
 
     def encoded(self) -> bytes:
         """Encode the message and return as a bytes buffer.
@@ -221,6 +227,7 @@ def encoded(self) -> bytes:
                 'next_timestamp': self.next_timestamp,
                 'settings_overlay': self.settings_overlay,
                 'message_number': self.message_number,
+                'saved_until': self.saved_until,
                 'data': self.data
                 }
 
diff --git a/libmuscle/python/libmuscle/snapshot.py b/libmuscle/python/libmuscle/snapshot.py
index 633d3f3d..2f86a220 100644
--- a/libmuscle/python/libmuscle/snapshot.py
+++ b/libmuscle/python/libmuscle/snapshot.py
@@ -84,7 +84,7 @@ def message_to_bytes(message: Optional['communicator.Message']) -> bytes:
             settings = message.settings
         return MPPMessage(Reference('_'), Reference('_'), None,
                           message.timestamp, message.next_timestamp,
-                          settings, 0, message.data).encoded()
+                          settings, 0, -1.0, message.data).encoded()
 
     @staticmethod
     def bytes_to_message(data: bytes) -> Optional['communicator.Message']:
diff --git a/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py b/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py
index 388e6eca..e111a758 100644
--- a/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py
+++ b/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py
@@ -1,4 +1,3 @@
-from datetime import datetime, timedelta, timezone
 import time
 import pytest
 from ymmsl import CheckpointRangeRule, CheckpointAtRule, Checkpoints
@@ -137,20 +136,19 @@ def test_combined_checkpoint_trigger_at_ranges():
 
 
 def test_trigger_manager_reference_time():
-    monotonic_now = time.monotonic()
-    utcnow = datetime.now(timezone.utc)
-    reference = utcnow - timedelta(seconds=15)
+    monotonic_start = time.monotonic()
+    ref_elapsed = 15.0
     trigger_manager = TriggerManager()
-    trigger_manager.set_checkpoint_info(reference, Checkpoints(at_end=True))
+    trigger_manager.set_checkpoint_info(ref_elapsed, Checkpoints(at_end=True))
     elapsed_walltime = trigger_manager.elapsed_walltime()
-    elapsed_monotonic = time.monotonic() - monotonic_now
-    assert 15.0 < elapsed_walltime <= (15.0 + elapsed_monotonic)
+    duration = time.monotonic() - monotonic_start
+    assert ref_elapsed < elapsed_walltime <= (ref_elapsed + duration)
 
 
 def test_trigger_manager():
-    reference = datetime.now(timezone.utc)
+    ref_elapsed = 0.0
     trigger_manager = TriggerManager()
-    trigger_manager.set_checkpoint_info(reference, Checkpoints(
+    trigger_manager.set_checkpoint_info(ref_elapsed, Checkpoints(
             at_end=True,
             wallclock_time=[CheckpointAtRule([1e-12])],
             simulation_time=[CheckpointAtRule([1, 3, 5])]))
@@ -181,8 +179,7 @@ def test_trigger_manager():
 
 def test_no_checkpointing() -> None:
     trigger_manager = TriggerManager()
-    trigger_manager.set_checkpoint_info(
-            datetime.now(timezone.utc), Checkpoints())
+    trigger_manager.set_checkpoint_info(0.0, Checkpoints())
 
     assert not trigger_manager.should_save_snapshot(1)
     assert not trigger_manager.should_save_snapshot(5000)
diff --git a/libmuscle/python/libmuscle/test/test_communicator.py b/libmuscle/python/libmuscle/test/test_communicator.py
index a297a820..140e9399 100644
--- a/libmuscle/python/libmuscle/test/test_communicator.py
+++ b/libmuscle/python/libmuscle/test/test_communicator.py
@@ -397,28 +397,31 @@ def test_receive_message(communicator) -> None:
     client_mock = MagicMock()
     client_mock.receive.return_value = MPPMessage(
             Reference('other.out[13]'), Reference('kernel[13].in'),
-            None, 0.0, None, Settings({'test1': 12}), 0,
+            None, 0.0, None, Settings({'test1': 12}), 0, 2.0,
             b'test').encoded()
     get_client_mock = MagicMock(return_value=client_mock)
     communicator._Communicator__get_client = get_client_mock
     communicator._profiler = MagicMock()
 
-    msg = communicator.receive_message('in')
+    msg, last_saved = communicator.receive_message('in')
 
     get_client_mock.assert_called_with(Reference('other'))
     client_mock.receive.assert_called_with(Reference('kernel[13].in'))
     assert msg.data == b'test'
     assert msg.settings['test1'] == 12
+    assert last_saved == 2.0
 
 
 def test_receive_message_default(communicator) -> None:
     communicator._peer_manager.is_connected.return_value = False
     default_msg = Message(3.0, 4.0, 'test', Settings())
-    msg = communicator.receive_message('not_connected', default=default_msg)
+    msg, last_saved = communicator.receive_message(
+            'not_connected', default=default_msg)
     assert msg.timestamp == 3.0
     assert msg.next_timestamp == 4.0
     assert msg.data == 'test'
     assert len(msg.settings) == 0
+    assert last_saved == float('-inf')
 
 
 def test_receive_message_no_default(communicator) -> None:
@@ -436,71 +439,75 @@ def test_receive_msgpack(communicator) -> None:
     client_mock = MagicMock()
     client_mock.receive.return_value = MPPMessage(
             Reference('other.out[13]'), Reference('kernel[13].in'),
-            None, 0.0, None, Settings({'test1': 12}), 0,
+            None, 0.0, None, Settings({'test1': 12}), 0, 1.0,
             {'test': 13}).encoded()
     get_client_mock = MagicMock(return_value=client_mock)
     communicator._Communicator__get_client = get_client_mock
     communicator._profiler = MagicMock()
 
-    msg = communicator.receive_message('in')
+    msg, last_saved = communicator.receive_message('in')
 
     get_client_mock.assert_called_with(Reference('other'))
     client_mock.receive.assert_called_with(Reference('kernel[13].in'))
     assert msg.data == {'test': 13}
+    assert last_saved == 1.0
 
 
 def test_receive_with_slot(communicator2) -> None:
     client_mock = MagicMock()
     client_mock.receive.return_value = MPPMessage(
             Reference('kernel[13].out'), Reference('other.in[13]'),
-            None, 0.0, None, Settings({'test': 'testing'}), 0,
+            None, 0.0, None, Settings({'test': 'testing'}), 0, 3.0,
             b'test').encoded()
     get_client_mock = MagicMock(return_value=client_mock)
     communicator2._Communicator__get_client = get_client_mock
     communicator2._profiler = MagicMock()
 
-    msg = communicator2.receive_message('in', 13)
+    msg, last_saved = communicator2.receive_message('in', 13)
 
     get_client_mock.assert_called_with(Reference('kernel[13]'))
     client_mock.receive.assert_called_with(Reference('other.in[13]'))
     assert msg.data == b'test'
     assert msg.settings['test'] == 'testing'
+    assert last_saved == 3.0
 
 
 def test_receive_message_resizable(communicator3) -> None:
     client_mock = MagicMock()
     client_mock.receive.return_value = MPPMessage(
             Reference('other.out[13]'), Reference('kernel.in[13]'),
-            20, 0.0, None, Settings({'test': 'testing'}), 0,
+            20, 0.0, None, Settings({'test': 'testing'}), 0, 12.3,
             b'test').encoded()
     get_client_mock = MagicMock(return_value=client_mock)
     communicator3._Communicator__get_client = get_client_mock
     communicator3._profiler = MagicMock()
 
-    msg = communicator3.receive_message('in', 13)
+    msg, last_saved = communicator3.receive_message('in', 13)
 
     get_client_mock.assert_called_with(Reference('other'))
     client_mock.receive.assert_called_with(Reference('kernel.in[13]'))
     assert msg.data == b'test'
     assert communicator3.get_port('in').get_length() == 20
+    assert last_saved == 12.3
 
 
 def test_receive_with_settings(communicator) -> None:
     client_mock = MagicMock()
     client_mock.receive.return_value = MPPMessage(
             Reference('other.out[13]'), Reference('kernel[13].in'),
-            None, 0.0, None, Settings({'test2': 3.1}), 0,
+            None, 0.0, None, Settings({'test2': 3.1}), 0, 0.1,
             b'test').encoded()
     get_client_mock = MagicMock(return_value=client_mock)
     communicator._Communicator__get_client = get_client_mock
     communicator._profiler = MagicMock()
 
-    msg = communicator.receive_message('in')
+    msg, last_saved = communicator.receive_message('in')
 
     get_client_mock.assert_called_with(Reference('other'))
     client_mock.receive.assert_called_with(Reference('kernel[13].in'))
     assert msg.data == b'test'
     assert msg.settings['test2'] == 3.1
+    assert last_saved == 0.1
 
 
 def test_receive_msgpack_with_slot_and_settings(communicator2) -> None:
@@ -508,56 +515,58 @@ def test_receive_msgpack_with_slot_and_settings(communicator2) -> None:
     client_mock.receive.return_value = MPPMessage(
             Reference('kernel[13].out'), Reference('other.in[13]'),
             None, 0.0, 1.0,
-            Settings({'test': 'testing'}), 0, 'test').encoded()
+            Settings({'test': 'testing'}), 0, 1.0, 'test').encoded()
     get_client_mock = MagicMock(return_value=client_mock)
     communicator2._Communicator__get_client = get_client_mock
     communicator2._profiler = MagicMock()
 
-    msg = communicator2.receive_message('in', 13)
+    msg, last_saved = communicator2.receive_message('in', 13)
 
     get_client_mock.assert_called_with(Reference('kernel[13]'))
     client_mock.receive.assert_called_with(Reference('other.in[13]'))
     assert msg.data == 'test'
     assert msg.settings['test'] == 'testing'
+    assert last_saved == 1.0
 
 
 def test_receive_settings(communicator) -> None:
     client_mock = MagicMock()
     client_mock.receive.return_value = MPPMessage(
             Reference('other.out[13]'), Reference('kernel[13].in'),
-            None, 0.0, None, Settings({'test1': 12}), 0,
+            None, 0.0, None, Settings({'test1': 12}), 0, 1.0,
             Settings({'test': 13})).encoded()
     get_client_mock = MagicMock(return_value=client_mock)
     communicator._Communicator__get_client = get_client_mock
     communicator._profiler = MagicMock()
 
-    msg = communicator.receive_message('in')
+    msg, last_saved = communicator.receive_message('in')
 
     get_client_mock.assert_called_with(Reference('other'))
     client_mock.receive.assert_called_with(Reference('kernel[13].in'))
     assert isinstance(msg.data, Settings)
     assert msg.data['test'] == 13
+    assert last_saved == 1.0
 
 
 def test_receive_close_port(communicator) -> None:
     client_mock = MagicMock()
     client_mock.receive.return_value = MPPMessage(
             Reference('other.out[13]'), Reference('kernel[13].in'),
-            None, 0.0, None, Settings(), 0, ClosePort()).encoded()
+            None, 0.0, None, Settings(), 0, 0.1, ClosePort()).encoded()
     get_client_mock = MagicMock(return_value=client_mock)
     communicator._Communicator__get_client = get_client_mock
     communicator._profiler = MagicMock()
 
-    msg = communicator.receive_message('in')
+    msg, _ = communicator.receive_message('in')
 
     assert isinstance(msg.data, ClosePort)
 
 
 def test_get_message(communicator, message) -> None:
-    communicator.send_message('out', message)
+    communicator.send_message('out', message, None, 2.0)
     ref_message = MPPMessage(
             Reference('kernel[13].out'), Reference('other.in[13]'),
-            None, 0.0, None, Settings(), 0, b'test').encoded()
+            None, 0.0, None, Settings(), 0, 2.0, b'test').encoded()
     assert communicator._post_office.get_message(
             'other.in[13]') == ref_message
 
@@ -616,7 +625,7 @@ def test_port_count_validation(communicator):
     client_mock = MagicMock()
     client_mock.receive.return_value = MPPMessage(
             Reference('other.out[13]'), Reference('kernel[13].in'),
-            None, 0.0, None, Settings({'test1': 12}), 0,
+            None, 0.0, None, Settings({'test1': 12}), 0, 7.6,
             b'test').encoded()
     get_client_mock = MagicMock(return_value=client_mock)
     communicator._Communicator__get_client = get_client_mock
@@ -634,7 +643,7 @@ def test_port_discard_error_on_resume(caplog, communicator):
     client_mock = MagicMock()
     client_mock.receive.return_value = MPPMessage(
             Reference('other.out[13]'), Reference('kernel[13].in'),
-            None, 0.0, None, Settings({'test1': 12}), 1,
+            None, 0.0, None, Settings({'test1': 12}), 1, 2.3,
             b'test').encoded()
     get_client_mock = MagicMock(return_value=client_mock)
     communicator._Communicator__get_client = get_client_mock
@@ -661,7 +670,7 @@ def test_port_discard_success_on_resume(caplog, communicator):
     client_mock = MagicMock()
     client_mock.receive.side_effect = [MPPMessage(
             Reference('other.out[13]'), Reference('kernel[13].in'),
-            None, 0.0, None, Settings({'test1': 12}), message_number,
+            None, 0.0, None, Settings({'test1': 12}), message_number, 1.0,
             {'this is message': message_number}).encoded()
             for message_number in [1, 2]]
     get_client_mock = MagicMock(return_value=client_mock)
@@ -676,7 +685,7 @@ def test_port_discard_success_on_resume(caplog, communicator):
         assert port.is_resuming(None)
 
     with caplog.at_level(logging.DEBUG, 'libmuscle.communicator'):
-        msg = communicator.receive_message('in')
+        msg, _ = communicator.receive_message('in')
         # records 0, 2 and 3 are debug logs for starting/receiving on port
         assert 'Discarding received message' in caplog.records[1].message
     # message_number=1 should be discarded:
diff --git a/libmuscle/python/libmuscle/test/test_instance.py b/libmuscle/python/libmuscle/test/test_instance.py
index 17e3e3e0..7c8b1be4 100644
--- a/libmuscle/python/libmuscle/test/test_instance.py
+++ b/libmuscle/python/libmuscle/test/test_instance.py
@@ -1,4 +1,3 @@
-from datetime import datetime, timezone
 import sys
 from typing import Generator
 from unittest.mock import MagicMock, patch
@@ -44,13 +43,12 @@ def instance(sys_argv_instance, tmp_path):
         settings = Settings()
         settings['test1'] = 12
         msg = Message(0.0, 1.0, 'message', settings)
-        communicator.receive_message.return_value = msg
+        communicator.receive_message.return_value = msg, 10.0
         comm_type.return_value = communicator
 
         mmp_client_object = MagicMock()
         mmp_client_object.request_peers.return_value = (None, None, None)
-        checkpoint_info = (datetime.now(timezone.utc), Checkpoints(), None,
-                           tmp_path)
+        checkpoint_info = (0.0, Checkpoints(), None, tmp_path)
         mmp_client_object.get_checkpoint_info.return_value = checkpoint_info
         mmp_client.return_value = mmp_client_object
 
@@ -68,8 +66,7 @@ def instance2(sys_argv_instance, tmp_path):
          patch('libmuscle.instance.Communicator'):
         mmp_client_object = MagicMock()
         mmp_client_object.request_peers.return_value = (None, None, None)
-        checkpoint_info = (datetime.now(timezone.utc), Checkpoints(), None,
-                           tmp_path)
+        checkpoint_info = (0.0, Checkpoints(), None, tmp_path)
         mmp_client_object.get_checkpoint_info.return_value = checkpoint_info
         mmp_client.return_value = mmp_client_object
         instance = Instance({
@@ -84,8 +81,7 @@ def test_create_instance(
          patch('libmuscle.instance.Communicator') as comm_type:
         mmp_client_object = MagicMock()
         mmp_client_object.request_peers.return_value = (None, None, None)
-        checkpoint_info = (datetime.now(timezone.utc), Checkpoints(), None,
-                           tmp_path)
+        checkpoint_info = (0.0, Checkpoints(), None, tmp_path)
         mmp_client_object.get_checkpoint_info.return_value = checkpoint_info
         mmp_client.return_value = mmp_client_object
         ports = {
@@ -169,9 +165,10 @@ def test_is_vector_port(instance):
 
 
 def test_send(instance, message):
+    instance._trigger_manager._cpts_considered_until = 17.0
     instance.send('out', message, 1)
     assert instance._communicator.send_message.called_with(
-            'out', message, 1)
+            'out', message, 1, 17.0)
 
 
 def test_send_invalid_port(instance, message):
@@ -242,7 +239,8 @@ def test_reuse_instance_receive_overlay(instance):
     test_overlay = Settings()
     test_overlay['test2'] = 'abc'
     recv = instance._communicator.receive_message
-    recv.return_value = Message(0.0, None, test_overlay, test_base_settings)
+    msg = Message(0.0, None, test_overlay, test_base_settings)
+    recv.return_value = msg, 0.0
     instance.reuse_instance()
     assert instance._communicator.receive_message.called_with(
         'muscle_settings_in')
@@ -254,9 +252,9 @@ def test_reuse_instance_receive_overlay(instance):
 def test_reuse_instance_closed_port(instance):
     def receive_message(port_name, slot=None, default=None):
         if port_name == 'muscle_settings_in':
-            return Message(0.0, None, Settings(), Settings())
+            return Message(0.0, None, Settings(), Settings()), 0.0
         elif port_name == 'in':
-            return Message(0.0, None, ClosePort(), Settings())
+            return Message(0.0, None, ClosePort(), Settings()), 1.0
         assert False    # pragma: no cover
 
     def get_port(port_name):
@@ -282,10 +280,10 @@ def get_port(port_name):
 def test_reuse_instance_vector_port(instance2):
     def receive_message(port_name, slot=None, default=None):
         if port_name == 'muscle_settings_in':
-            return Message(0.0, None, Settings(), Settings())
+            return Message(0.0, None, Settings(), Settings()), 0.0
         elif port_name == 'in':
             data = 'test {}'.format(slot)
-            return Message(0.0, None, data, Settings())
+            return Message(0.0, None, data, Settings()), 0.0
         assert False    # pragma: no cover
 
     instance2._communicator.receive_message = receive_message
@@ -310,7 +308,7 @@ def receive_message(port_name, slot=None, default=None):
 
 def test_reuse_instance_no_f_init_ports(instance):
     instance._communicator.receive_message.return_value = Message(
-            0.0, None, Settings(), Settings())
+            0.0, None, Settings(), Settings()), 0.0
     instance._communicator.list_ports.return_value = {}
     instance._communicator.settings_in_connected.return_value = False
     do_reuse = instance.reuse_instance()
diff --git a/libmuscle/python/libmuscle/test/test_mpp_message.py b/libmuscle/python/libmuscle/test/test_mpp_message.py
index dce3ed88..aaebe351 100644
--- a/libmuscle/python/libmuscle/test/test_mpp_message.py
+++ b/libmuscle/python/libmuscle/test/test_mpp_message.py
@@ -15,10 +15,12 @@ def test_create() -> None:
     timestamp = 10.0
     next_timestamp = 11.0
     settings_overlay = (6789).to_bytes(2, 'little', signed=True)
+    message_number = 0
+    saved_until = 1.6
     data = (12345).to_bytes(2, 'little', signed=True)
     msg = MPPMessage(
             sender, receiver, None, timestamp, next_timestamp,
-            settings_overlay, 0, data)
+            settings_overlay, message_number, saved_until, data)
     assert msg.sender == sender
     assert msg.receiver == receiver
     assert msg.port_length is None
@@ -26,6 +28,7 @@ def test_create() -> None:
     assert msg.next_timestamp == 11.0
     assert msg.settings_overlay == settings_overlay
     assert msg.message_number == 0
+    assert msg.saved_until == 1.6
     assert msg.data == data
 
 
@@ -44,7 +47,7 @@ def test_grid_encode() -> None:
     grid = Grid(array, ['x', 'y', 'z'])
     msg = MPPMessage(
             sender, receiver, None, timestamp, next_timestamp, Settings(),
-            0, grid)
+            0, 1.0, grid)
 
     wire_data = msg.encoded()
     mcp_decoded = msgpack.unpackb(wire_data, raw=False)
@@ -88,6 +91,7 @@ def test_grid_decode() -> None:
             'next_timestamp': None,
             'settings_overlay': msgpack.ExtType(1, settings_data),
             'message_number': 0,
+            'saved_until': 9.9,
             'data': msgpack.ExtType(2, grid_data)}
 
     wire_data = msgpack.packb(msg_dict, use_bin_type=True)
@@ -137,7 +141,7 @@ def test_grid_roundtrip() -> None:
         grid = Grid(array, ['x', 'y', 'z'])
         msg = MPPMessage(
                 sender, receiver, None, timestamp, next_timestamp, Settings(),
-                0, grid)
+                0, 1.0, grid)
 
         wire_data = msg.encoded()
         msg_out = MPPMessage.from_bytes(wire_data)
@@ -171,7 +175,7 @@ def test_non_contiguous_grid_roundtrip() -> None:
     grid = Grid(array.real, ['a', 'b', 'c'])
     msg = MPPMessage(
             sender, receiver, None, timestamp, next_timestamp, Settings(),
-            0, grid)
+            0, 7.7, grid)
 
     wire_data = msg.encoded()
     msg_out = MPPMessage.from_bytes(wire_data)
diff --git a/libmuscle/python/libmuscle/test/test_outbox.py b/libmuscle/python/libmuscle/test/test_outbox.py
index cb4af31a..a2e97c40 100644
--- a/libmuscle/python/libmuscle/test/test_outbox.py
+++ b/libmuscle/python/libmuscle/test/test_outbox.py
@@ -19,7 +19,7 @@ def message():
             Ref('sender.out'), Ref('receiver.in'),
             None, 0.0, 1.0,
             bytes(),
-            0,
+            0, 1.0,
             'testing'.encode('utf-8'))
 
 

From fc9168dd8a046694e4c0bf294cd82f657daca87b Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Tue, 10 Jan 2023 13:29:39 +0100
Subject: [PATCH 121/183] Improved API Guard error messages

---
 libmuscle/python/libmuscle/api_guard.py | 87 ++++++++++++++++---------
 1 file changed, 57 insertions(+), 30 deletions(-)

diff --git a/libmuscle/python/libmuscle/api_guard.py b/libmuscle/python/libmuscle/api_guard.py
index 1f4fbfd3..56012448 100644
--- a/libmuscle/python/libmuscle/api_guard.py
+++ b/libmuscle/python/libmuscle/api_guard.py
@@ -22,6 +22,9 @@ class APIPhase(Enum):
     we know that we should expect resume() after reuse_instance() and
     we use BEFORE_RESUMING accordingly.
     """
+    BEFORE_FIRST_REUSE_INSTANCE = auto()
+    """Before the first time calling reuse_instance"""
+
     BEFORE_REUSE_INSTANCE = auto()
     """Before calling reuse_instance"""
 
@@ -62,11 +65,47 @@ class APIGuard:
     def __init__(self) -> None:
         """Create an APIPhaseTracker.
 
-        This starts the tracker in BEFORE_REUSE_INSTANCE.
+        This starts the tracker in BEFORE_FIRST_REUSE_INSTANCE.
         """
-        self._phase = APIPhase.BEFORE_REUSE_INSTANCE
+        self._phase = APIPhase.BEFORE_FIRST_REUSE_INSTANCE
         self._uses_checkpointing = None     # type: Optional[bool]
 
+    def _generic_error_messages(self, verify_phase: str) -> None:
+        if self._phase in (
+                APIPhase.BEFORE_FIRST_REUSE_INSTANCE,
+                APIPhase.AFTER_REUSE_LOOP):
+            msg = f'Please only call {verify_phase} inside the reuse loop.'
+        elif self._phase == APIPhase.BEFORE_REUSE_INSTANCE:
+            msg = (
+                    'Please do not call {verify_phase} after'
+                    ' should_save_final_snapshot. should_save_final_snapshot'
+                    ' should be at the end of the reuse loop.')
+        elif self._phase == APIPhase.AFTER_REUSE_INSTANCE:
+            msg = (
+                    'Please call resuming first in the reuse loop, before'
+                    f' {verify_phase}')
+        elif self._phase == APIPhase.BEFORE_RESUMING:
+            msg = 'Inside the reuse loop you must call resuming first.'
+        elif self._phase == APIPhase.BEFORE_LOAD_SNAPSHOT:
+            msg = (
+                    'If resuming returns True, then you must call'
+                    ' load_snapshot first.')
+        elif self._phase == APIPhase.BEFORE_SHOULD_INIT:
+            msg = 'After calling resuming, you must call should_init first.'
+        elif self._phase == APIPhase.BEFORE_SHOULD_SAVE_SNAPSHOT:
+            msg = 'You must call save_snapshot or save_final_snapshot first.'
+        elif self._phase == APIPhase.BEFORE_SAVE_SNAPSHOT:
+            msg = (
+                    'If should_save_snapshot returns True, then you must'
+                    ' call save_snapshot first.')
+        elif self._phase == APIPhase.BEFORE_SAVE_FINAL_SNAPSHOT:
+            msg = (
+                    'If should_save_final_snapshot returns True, then you'
+                    ' must call save_final_snapshot first.')
+        else:
+            return
+        raise RuntimeError(msg)
+
     def uses_checkpointing(self) -> bool:
         """Return whether the code is using checkpointing.
 
@@ -90,8 +129,13 @@ def verify_reuse_instance(self) -> None:
         """Check reuse_instance()"""
         if self._phase == APIPhase.AFTER_REUSE_INSTANCE:
             self._uses_checkpointing = False
-        elif self._phase != APIPhase.BEFORE_REUSE_INSTANCE:
-            raise RuntimeError()
+        elif self._phase not in (
+                APIPhase.BEFORE_REUSE_INSTANCE,
+                APIPhase.BEFORE_FIRST_REUSE_INSTANCE):
+            raise RuntimeError(
+                    'We reached the end of the reuse loop without checking'
+                    ' if a snapshot should be saved. Please add at least'
+                    ' a should_save_final_snapshot and save_final_snapshot.')
 
     def reuse_instance_done(self, reusing: bool) -> None:
         """Update phase on successful reuse_instance().
@@ -154,10 +198,8 @@ def should_init_done(self) -> None:
     def verify_should_save_snapshot(self) -> None:
         """Check should_save_snapshot()"""
         if self._phase != APIPhase.BEFORE_SHOULD_SAVE_SNAPSHOT:
-            raise RuntimeError(
-                    'We reached the end of the reuse loop without checking'
-                    ' if a snapshot should be saved. Please add at least'
-                    ' a should_save_final_snapshot and save_final_snapshot.')
+            self._generic_error_messages('should_save_snapshot')
+            raise RuntimeError()  # should be unreachable
 
     def should_save_snapshot_done(self, should_save: bool) -> None:
         """Update phase on successful should_save_snapshot().
@@ -169,9 +211,10 @@ def should_save_snapshot_done(self, should_save: bool) -> None:
             self._phase = APIPhase.BEFORE_SAVE_SNAPSHOT
 
     def verify_save_snapshot(self) -> None:
-        """Check should_save_snapshot()"""
+        """Check save_snapshot()"""
         if self._phase != APIPhase.BEFORE_SAVE_SNAPSHOT:
-            raise RuntimeError()
+            self._generic_error_messages('save_snapshot')
+            raise RuntimeError()  # should be unreachable
 
     def save_snapshot_done(self) -> None:
         """Update phase on successful save_snapshot()"""
@@ -180,25 +223,8 @@ def save_snapshot_done(self) -> None:
     def verify_should_save_final_snapshot(self) -> None:
         """Check should_save_final_snapshot()."""
         if self._phase != APIPhase.BEFORE_SHOULD_SAVE_SNAPSHOT:
-            if self._phase in (
-                    APIPhase.BEFORE_REUSE_INSTANCE, APIPhase.AFTER_REUSE_LOOP):
-                msg = (
-                        'Please only call should_save_final_snapshot inside'
-                        ' the reuse loop.')
-            elif self._phase == APIPhase.BEFORE_SAVE_FINAL_SNAPSHOT:
-                msg = (
-                        'If should_save_final_snapshot returns True, then you'
-                        ' must call save_final_snapshot immediately.')
-            elif self._phase == APIPhase.BEFORE_SAVE_SNAPSHOT:
-                msg = (
-                        'If should_save_snapshot returns True, then you must'
-                        ' call save_snapshot first.')
-            else:
-                msg = (
-                        'Please only call should_save_final_snapshot at the'
-                        ' end of the reuse loop.')
-
-            raise RuntimeError(msg)
+            self._generic_error_messages('should_save_final_snapshot')
+            raise RuntimeError()  # should be unreachable
 
     def should_save_final_snapshot_done(self, should_save: bool) -> None:
         """Update phase on successful should_save_snapshot().
@@ -214,7 +240,8 @@ def should_save_final_snapshot_done(self, should_save: bool) -> None:
     def verify_save_final_snapshot(self) -> None:
         """Check should_save_final_snapshot()"""
         if self._phase != APIPhase.BEFORE_SAVE_FINAL_SNAPSHOT:
-            raise RuntimeError()
+            self._generic_error_messages('save_final_snapshot')
+            raise RuntimeError()  # should be unreachable
 
     def save_final_snapshot_done(self) -> None:
         """Updates state on successful save_final_snapshot()"""

From ced9d99952334d158dd51b5cb72c8d889ffe919f Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Tue, 10 Jan 2023 13:53:25 +0100
Subject: [PATCH 122/183] Add logging to harmonise_wall_time

---
 libmuscle/python/libmuscle/checkpoint_triggers.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/libmuscle/python/libmuscle/checkpoint_triggers.py b/libmuscle/python/libmuscle/checkpoint_triggers.py
index 7ea2759d..6bf5f059 100644
--- a/libmuscle/python/libmuscle/checkpoint_triggers.py
+++ b/libmuscle/python/libmuscle/checkpoint_triggers.py
@@ -206,6 +206,9 @@ def harmonise_wall_time(self, at_least: float) -> None:
         """
         cur = self.elapsed_walltime()
         if cur < at_least:
+            _logger.debug(
+                    'Harmonise wall time: advancing clock by %f seconds',
+                    at_least - cur)
             self._mono_to_elapsed += at_least - cur
 
     def snapshots_enabled(self) -> bool:

From e402fdfcb50866335d6fdc3229f554652f8423e7 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Tue, 10 Jan 2023 13:55:57 +0100
Subject: [PATCH 123/183] Replace isinstance check for ImplementationState

---
 libmuscle/python/libmuscle/instance.py | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py
index 1f8dc1dd..8f8321f2 100644
--- a/libmuscle/python/libmuscle/instance.py
+++ b/libmuscle/python/libmuscle/instance.py
@@ -50,13 +50,7 @@ def __init__(self, ports: Optional[Dict[Operator, List[str]]] = None,
         """
         self.__is_shut_down = False
 
-        if not isinstance(stateful, ImplementationState):
-            raise ValueError(
-                    f'Invalid value supplied for "stateful": {stateful}.'
-                    ' Expected one of ImplementationState.STATEFUL,'
-                    ' ImplementationState.STATELESS or ImplementationState.'
-                    'WEAKLY_STATEFUL.')
-        self._stateful = stateful
+        self._stateful = ImplementationState(stateful)
 
         # Note that these are accessed by Muscle3, but otherwise private.
         self._name, self._index = self.__make_full_name()

From 64544d872611688c621c2fc310fe5a5a2f3f9acc Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Tue, 10 Jan 2023 13:57:29 +0100
Subject: [PATCH 124/183] Verify snapshot Messages are not None

None messages indicate implicit snapshots, which are handled differently
---
 libmuscle/python/libmuscle/instance.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py
index 8f8321f2..bb559636 100644
--- a/libmuscle/python/libmuscle/instance.py
+++ b/libmuscle/python/libmuscle/instance.py
@@ -561,6 +561,8 @@ def save_snapshot(self, message: Message) -> None:
                 store the internal state of the submodel.
         """
         self._api_guard.verify_save_snapshot()
+        if message is None:
+            raise RuntimeError('Please specify a Message to save as snapshot.')
         self._save_snapshot(message, False)
         self._api_guard.save_snapshot_done()
 
@@ -627,6 +629,8 @@ def save_final_snapshot(self, message: Message) -> None:
                 submodel.
         """
         self._api_guard.verify_save_final_snapshot()
+        if message is None:
+            raise RuntimeError('Please specify a Message to save as snapshot.')
         self._save_snapshot(message, True, self.__f_init_max_timestamp)
         self._api_guard.save_final_snapshot_done()
 

From 4f7f0108b4fff3ab2a0fc454c05f5cc90d137859 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Mon, 5 Dec 2022 14:13:02 +0100
Subject: [PATCH 125/183] Fix should_save_final_snapshot when not reusing

ClosePort messages have `inf` timestamps, so would always trigger a
final snapshot. Only expected when `at_end` checkpoints should be taken.
---
 libmuscle/python/libmuscle/checkpoint_triggers.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/libmuscle/python/libmuscle/checkpoint_triggers.py b/libmuscle/python/libmuscle/checkpoint_triggers.py
index 6bf5f059..b0064c75 100644
--- a/libmuscle/python/libmuscle/checkpoint_triggers.py
+++ b/libmuscle/python/libmuscle/checkpoint_triggers.py
@@ -233,9 +233,10 @@ def should_save_final_snapshot(
             return False
 
         value = False
-        if not do_reuse and self._checkpoint_at_end:
-            value = True
-            self._last_triggers.append('at_end')
+        if not do_reuse:
+            if self._checkpoint_at_end:
+                value = True
+                self._last_triggers.append('at_end')
         elif f_init_max_timestamp is None:
             # No F_INIT messages received: reuse triggered on muscle_settings_in
             # message.

From 8a9538f25cd90f6e87068d011f7dddcd8c2feba9 Mon Sep 17 00:00:00 2001
From: Lourens Veen <l.veen@esciencecenter.nl>
Date: Tue, 10 Jan 2023 21:58:57 +0100
Subject: [PATCH 126/183] Incorporate latest yMMSL changes

---
 .../test_snapshot_complex_coupling.py         |  4 ++--
 integration_test/test_snapshot_dispatch.py    |  4 ++--
 integration_test/test_snapshot_macro_micro.py |  6 +++---
 libmuscle/python/libmuscle/instance.py        | 21 ++++++++++---------
 .../manager/test/test_snapshot_registry.py    | 18 ++++++++--------
 5 files changed, 27 insertions(+), 26 deletions(-)

diff --git a/integration_test/test_snapshot_complex_coupling.py b/integration_test/test_snapshot_complex_coupling.py
index a75a89ce..dad2ee34 100644
--- a/integration_test/test_snapshot_complex_coupling.py
+++ b/integration_test/test_snapshot_complex_coupling.py
@@ -2,7 +2,7 @@
 import time
 
 import pytest
-from ymmsl import ImplementationState, Operator, load, dump
+from ymmsl import KeepsStateForNextUse, Operator, load, dump
 
 from libmuscle import Instance, Message
 from libmuscle.manager.run_dir import RunDir
@@ -57,7 +57,7 @@ def cache_component(max_channels=2):
 def echo_component(max_channels=2):
     ports = {Operator.F_INIT: [f'in{i+1}' for i in range(max_channels)],
              Operator.O_F: [f'out{i+1}' for i in range(max_channels)]}
-    instance = Instance(ports, stateful=ImplementationState.STATELESS)
+    instance = Instance(ports, keeps_state_for_next_use=KeepsStateForNextUse.NO)
 
     while instance.reuse_instance():
         for p_in, p_out in zip(ports[Operator.F_INIT], ports[Operator.O_F]):
diff --git a/integration_test/test_snapshot_dispatch.py b/integration_test/test_snapshot_dispatch.py
index 106f6d3c..7102a43c 100644
--- a/integration_test/test_snapshot_dispatch.py
+++ b/integration_test/test_snapshot_dispatch.py
@@ -1,5 +1,5 @@
 import pytest
-from ymmsl import ImplementationState, Operator, load, dump
+from ymmsl import KeepsStateForNextUse, Operator, load, dump
 
 from libmuscle import Instance, Message
 from libmuscle.manager.run_dir import RunDir
@@ -47,7 +47,7 @@ def stateless_component():
     instance = Instance({
             Operator.F_INIT: ['f_i'],
             Operator.O_F: ['o_f']},
-            stateful=ImplementationState.STATELESS)
+            stateful=KeepsStateForNextUse.NO)
 
     while instance.reuse_instance():
         dt = instance.get_setting('dt', 'float')
diff --git a/integration_test/test_snapshot_macro_micro.py b/integration_test/test_snapshot_macro_micro.py
index f8b11cb4..885ac704 100644
--- a/integration_test/test_snapshot_macro_micro.py
+++ b/integration_test/test_snapshot_macro_micro.py
@@ -1,5 +1,5 @@
 import pytest
-from ymmsl import ImplementationState, Operator, load, dump
+from ymmsl import KeepsStateForNextUse, Operator, load, dump
 
 from libmuscle import Instance, Message
 from libmuscle.manager.run_dir import RunDir
@@ -127,7 +127,7 @@ def stateless_micro():
     instance = Instance({
             Operator.F_INIT: ['f_i'],
             Operator.O_F: ['o_f']},
-            stateful=ImplementationState.STATELESS)
+            keeps_state_for_next_use=KeepsStateForNextUse.NO)
 
     while instance.reuse_instance():
         dt = instance.get_setting('dt', 'float')
@@ -149,7 +149,7 @@ def data_transformer():
     instance = Instance({
             Operator.F_INIT: ['f_i'],
             Operator.O_F: ['o_f']},
-            stateful=ImplementationState.STATELESS)
+            keeps_state_for_next_use=KeepsStateForNextUse.NO)
 
     while instance.reuse_instance():
         msg = instance.receive('f_i')
diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py
index bb559636..e4e685f7 100644
--- a/libmuscle/python/libmuscle/instance.py
+++ b/libmuscle/python/libmuscle/instance.py
@@ -7,7 +7,7 @@
 from typing_extensions import Literal
 
 from ymmsl import (Identifier, Operator, SettingValue, Port, Reference,
-                   Settings, ImplementationState)
+                   Settings, KeepsStateForNextUse)
 
 from libmuscle.api_guard import APIGuard
 from libmuscle.checkpoint_triggers import TriggerManager
@@ -35,22 +35,23 @@ class Instance:
     This class provides a low-level send/receive API for the instance
     to use.
     """
-    def __init__(self, ports: Optional[Dict[Operator, List[str]]] = None,
-                 stateful: ImplementationState = ImplementationState.STATEFUL
-                 ) -> None:
+    def __init__(
+            self, ports: Optional[Dict[Operator, List[str]]] = None,
+            keeps_state_for_next_use: KeepsStateForNextUse
+            = KeepsStateForNextUse.NECESSARY) -> None:
         """Create an Instance.
 
         Args:
             ports: A list of port names for each
                 :external:py:class:`~ymmsl.Operator` of this component.
-            stateful: Indicate whether this instance carries state between
-                iterations of the reuse loop. See
-                :external:py:class:`ymmsl.ImplementationState` for a description
-                of the options.
+            keeps_state_for_next_use: Indicate whether this instance carries
+                state between iterations of the reuse loop. See
+                :external:py:class:`ymmsl.KeepsStateForNextUse` for a
+                description of the options.
         """
         self.__is_shut_down = False
 
-        self._stateful = ImplementationState(stateful)
+        self._keeps_state = KeepsStateForNextUse(keeps_state_for_next_use)
 
         # Note that these are accessed by Muscle3, but otherwise private.
         self._name, self._index = self.__make_full_name()
@@ -183,7 +184,7 @@ def reuse_instance(self, apply_overlay: bool = True) -> bool:
         do_implicit_checkpoint = (
                 not self._first_run and
                 not self._api_guard.uses_checkpointing() and
-                self._stateful is not ImplementationState.STATEFUL)
+                self._keeps_state is not KeepsStateForNextUse.NECESSARY)
 
         if do_implicit_checkpoint:
             if self._trigger_manager.should_save_final_snapshot(
diff --git a/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py b/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py
index 6b9838e6..2d71630b 100644
--- a/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py
+++ b/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py
@@ -5,7 +5,7 @@
 import pytest
 from ymmsl import (
         Configuration, Model, Component, Conduit, Implementation,
-        ImplementationState as IState, Reference)
+        KeepsStateForNextUse, Reference)
 
 from libmuscle.manager.snapshot_registry import (
     SnapshotNode, SnapshotRegistry, calc_consistency, calc_consistency_list,
@@ -35,14 +35,14 @@ def macro_micro(micro_is_stateless: bool) -> Configuration:
 
     if micro_is_stateless:
         micro_impl = Implementation(
-                'micro_impl', stateful=IState.STATELESS, executable='pass')
+                'micro_impl',
+                keeps_state_for_next_use=KeepsStateForNextUse.NO,
+                executable='pass')
     else:
-        micro_impl = Implementation(
-                'micro_impl', supports_checkpoint=True, executable='pass')
+        micro_impl = Implementation('micro_impl', executable='pass')
 
     implementations = [
-            Implementation(
-                    'macro_impl', supports_checkpoint=True, executable='pass'),
+            Implementation('macro_impl', executable='pass'),
             micro_impl]
 
     return Configuration(model, implementations=implementations)
@@ -60,9 +60,9 @@ def uq(macro_micro: Configuration) -> Configuration:
             Conduit('rr.back_out', 'macro.muscle_settings_in'),
             Conduit('macro.final_state_out', 'rr.back_in')])
     macro_micro.implementations[Reference('qmc_impl')] = Implementation(
-            'qmc_impl', supports_checkpoint=True, executable='pass')
+            'qmc_impl', executable='pass')
     macro_micro.implementations[Reference('rr_impl')] = Implementation(
-            'rr_impl', supports_checkpoint=True, executable='pass')
+            'rr_impl', executable='pass')
     return macro_micro
 
 
@@ -381,7 +381,7 @@ def test_heuristic_rollbacks() -> None:
     conduits = [Conduit(f'comp{i}.o_f', f'comp{i+1}.f_i') for i in range(3)]
     model = Model('linear', components, conduits)
     implementations = [
-            Implementation(f'impl{i}', supports_checkpoint=True, script='xyz')
+            Implementation(f'impl{i}', script='xyz')
             for i in range(4)]
     config = Configuration(model, implementations=implementations)
 

From 4dfbd56277864346d23744e8bab65d970947d70b Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Wed, 11 Jan 2023 10:03:20 +0100
Subject: [PATCH 127/183] Update tox.ini to refer to ymmsl@develop branch

---
 tox.ini | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tox.ini b/tox.ini
index 006e8901..020ee9e2 100644
--- a/tox.ini
+++ b/tox.ini
@@ -8,7 +8,7 @@ deps =
     flake8<6
     pytest
     pytest-cov
-    git+https://github.com/multiscale/ymmsl-python.git@feature/checkpointing#egg=ymmsl
+    git+https://github.com/multiscale/ymmsl-python.git@develop#egg=ymmsl
 
 passenv =
     MUSCLE_TEST_PYTHON_ONLY

From bc43c516ec82c9ea0ec369876f0fc8aa1edb8ae0 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Wed, 11 Jan 2023 10:04:17 +0100
Subject: [PATCH 128/183] Fix mypy error (np.bool8 -> np.bool_)

Fixes #147
---
 libmuscle/python/libmuscle/mpp_message.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libmuscle/python/libmuscle/mpp_message.py b/libmuscle/python/libmuscle/mpp_message.py
index c57effb3..976f9f5e 100644
--- a/libmuscle/python/libmuscle/mpp_message.py
+++ b/libmuscle/python/libmuscle/mpp_message.py
@@ -93,7 +93,7 @@ def _decode_grid(code: int, data: bytes) -> Grid:
             ExtTypeId.GRID_INT64: np.int64,
             ExtTypeId.GRID_FLOAT32: np.float32,
             ExtTypeId.GRID_FLOAT64: np.float64,
-            ExtTypeId.GRID_BOOL: np.bool8}
+            ExtTypeId.GRID_BOOL: np.bool_}
 
     order_map = {
             'fa': 'F',

From 6a9fea67d16f1088540ea3a9f63bcd769403a293 Mon Sep 17 00:00:00 2001
From: Lourens Veen <l.veen@esciencecenter.nl>
Date: Wed, 11 Jan 2023 11:06:48 +0100
Subject: [PATCH 129/183] Use ymmsl@develop for the examples as well

---
 docs/source/examples/python/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/examples/python/requirements.txt b/docs/source/examples/python/requirements.txt
index fa14df52..cabe0c71 100644
--- a/docs/source/examples/python/requirements.txt
+++ b/docs/source/examples/python/requirements.txt
@@ -3,6 +3,6 @@ numpy<1.22; python_version=='3.7'
 numpy>=1.22,<=1.25; python_version>='3.8'
 sobol_seq==0.2.0
 yatiml==0.9.0
-ymmsl>=0.12.0,<0.13
+git+https://github.com/multiscale/ymmsl-python.git@develop#egg=ymmsl
 qcg-pilotjob==0.13.1
 

From db4659224215987582284777701f4967791be1b7 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Wed, 11 Jan 2023 13:55:48 +0100
Subject: [PATCH 130/183] Remove `snapshots_enabled` API call

---
 libmuscle/python/libmuscle/checkpoint_triggers.py |  5 -----
 libmuscle/python/libmuscle/instance.py            | 12 ------------
 2 files changed, 17 deletions(-)

diff --git a/libmuscle/python/libmuscle/checkpoint_triggers.py b/libmuscle/python/libmuscle/checkpoint_triggers.py
index b0064c75..d639f48a 100644
--- a/libmuscle/python/libmuscle/checkpoint_triggers.py
+++ b/libmuscle/python/libmuscle/checkpoint_triggers.py
@@ -211,11 +211,6 @@ def harmonise_wall_time(self, at_least: float) -> None:
                     at_least - cur)
             self._mono_to_elapsed += at_least - cur
 
-    def snapshots_enabled(self) -> bool:
-        """Check if the current workflow has snapshots enabled.
-        """
-        return self._has_checkpoints
-
     def should_save_snapshot(self, timestamp: float) -> bool:
         """Handles instance.should_save_snapshot
         """
diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py
index e4e685f7..732f6919 100644
--- a/libmuscle/python/libmuscle/instance.py
+++ b/libmuscle/python/libmuscle/instance.py
@@ -446,18 +446,6 @@ def receive_with_settings(
         """
         return self.__receive_message(port_name, slot, default, True)
 
-    def snapshots_enabled(self) -> bool:
-        """Check if the current workflow has snapshots enabled.
-
-        When snapshots are not enabled, all calls to
-        :meth:`should_save_snapshot` and :meth:`should_save_final_snapshot` will
-        return False.
-
-        Returns:
-            True iff checkpoint rules are defined in the workflow yMMSL.
-        """
-        return self._trigger_manager.snapshots_enabled()
-
     def resuming(self) -> bool:
         """Check if this instance is resuming from a snapshot.
 

From d9c395057426592ff85538b12279f145a35c395c Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Wed, 11 Jan 2023 14:14:22 +0100
Subject: [PATCH 131/183] Remove "sim_reset" logic

---
 libmuscle/python/libmuscle/checkpoint_triggers.py | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/libmuscle/python/libmuscle/checkpoint_triggers.py b/libmuscle/python/libmuscle/checkpoint_triggers.py
index d639f48a..434f5a5d 100644
--- a/libmuscle/python/libmuscle/checkpoint_triggers.py
+++ b/libmuscle/python/libmuscle/checkpoint_triggers.py
@@ -189,7 +189,6 @@ def set_checkpoint_info(
         self._sim = CombinedCheckpointTriggers(checkpoints.simulation_time)
         self._prevsim = None        # type: Optional[float]
         self._nextsim = None        # type: Optional[float]
-        self._sim_reset = True
 
     def elapsed_walltime(self) -> float:
         """Returns elapsed wallclock_time in seconds.
@@ -237,7 +236,6 @@ def should_save_final_snapshot(
             # message.
             _logger.debug('Reuse triggered by muscle_settings_in.'
                           ' Not creating a snapshot.')
-            self._sim_reset = True
         else:
             value = self.__should_save(f_init_max_timestamp)
 
@@ -256,10 +254,6 @@ def update_checkpoints(self, timestamp: float) -> None:
         self._prevsim = timestamp
         self._nextsim = self._sim.next_checkpoint(timestamp)
 
-        # this method is also called during resume, after which we no longer
-        # consider the simulation_time as reset
-        self._sim_reset = False
-
     def get_triggers(self) -> List[str]:
         """Get trigger description(s) for the current reason for checkpointing.
         """
@@ -273,7 +267,7 @@ def __should_save(self, simulation_time: float) -> bool:
         Args:
             simulation_time: current/next timestamp as reported by the instance
         """
-        if self._sim_reset:
+        if self._nextsim is None and self._prevsim is None:
             # we cannot make assumptions about the start time of a simulation,
             # a t=-1000 could make sense if t represents years since CE
             # and we should not disallow checkpointing for negative t
@@ -284,7 +278,6 @@ def __should_save(self, simulation_time: float) -> bool:
                 self._nextsim = previous
             else:
                 self._nextsim = self._sim.next_checkpoint(simulation_time)
-            self._sim_reset = False
 
         walltime = self.elapsed_walltime()
         self._cpts_considered_until = walltime

From 9553c9eaba28a838fc8a6a18af360d11db8436ec Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Wed, 11 Jan 2023 17:10:47 +0100
Subject: [PATCH 132/183] Implement InstanceFlags in Python

Deprecates supplying `apply_overlay` in Instance.reuse_instance.

Note: UQ examples are not yet updated.
---
 .../examples/python/interact_coupling.py      |   4 +-
 integration_test/test_parameter_overlays.py   |   7 +-
 .../test_snapshot_complex_coupling.py         |  11 +-
 integration_test/test_snapshot_dispatch.py    |   9 +-
 integration_test/test_snapshot_interact.py    |   4 +-
 integration_test/test_snapshot_macro_micro.py |  15 ++-
 libmuscle/python/libmuscle/__init__.py        |  11 +-
 libmuscle/python/libmuscle/api_guard.py       |  47 ++-----
 libmuscle/python/libmuscle/instance.py        | 120 +++++++++++++-----
 libmuscle/python/libmuscle/test/conftest.py   |   2 +-
 .../python/libmuscle/test/test_api_guard.py   |  20 ++-
 11 files changed, 144 insertions(+), 106 deletions(-)

diff --git a/docs/source/examples/python/interact_coupling.py b/docs/source/examples/python/interact_coupling.py
index 3df5e11e..d0e6a836 100644
--- a/docs/source/examples/python/interact_coupling.py
+++ b/docs/source/examples/python/interact_coupling.py
@@ -1,7 +1,7 @@
 import logging
 from typing import Any, Optional, Tuple, Dict
 
-from libmuscle import Instance, Message
+from libmuscle import Instance, Message, USES_CHECKPOINT_API
 from libmuscle.runner import run_simulation
 from ymmsl import (
         Component, Conduit, Configuration, Model, Operator, Ports, Settings)
@@ -275,7 +275,7 @@ def checkpointing_temporal_coupler() -> None:
     """
     instance = Instance({
         Operator.O_I: ['a_out', 'b_out'],
-        Operator.S: ['a_in', 'b_in']})
+        Operator.S: ['a_in', 'b_in']}, USES_CHECKPOINT_API)
 
     while instance.reuse_instance():
         if instance.resuming():
diff --git a/integration_test/test_parameter_overlays.py b/integration_test/test_parameter_overlays.py
index cf091193..5b772d10 100644
--- a/integration_test/test_parameter_overlays.py
+++ b/integration_test/test_parameter_overlays.py
@@ -3,7 +3,7 @@
 from ymmsl import (Component, Conduit, Configuration, Model, Operator,
                    Settings)
 
-from libmuscle import Instance, Message
+from libmuscle import Instance, Message, DONT_APPLY_OVERLAY
 from libmuscle.runner import run_simulation
 
 
@@ -49,9 +49,10 @@ def explicit_relay():
     having MUSCLE handle them. This just passes all information on.
     """
     instance = Instance({
-            Operator.F_INIT: ['in[]'], Operator.O_F: ['out[]']})
+            Operator.F_INIT: ['in[]'], Operator.O_F: ['out[]']},
+            DONT_APPLY_OVERLAY)
 
-    while instance.reuse_instance(False):
+    while instance.reuse_instance():
         # f_init
         assert instance.get_setting('test2', 'float') == 13.3
         assert instance.get_port_length('in') == instance.get_port_length(
diff --git a/integration_test/test_snapshot_complex_coupling.py b/integration_test/test_snapshot_complex_coupling.py
index dad2ee34..e3b408de 100644
--- a/integration_test/test_snapshot_complex_coupling.py
+++ b/integration_test/test_snapshot_complex_coupling.py
@@ -2,9 +2,10 @@
 import time
 
 import pytest
-from ymmsl import KeepsStateForNextUse, Operator, load, dump
+from ymmsl import Operator, load, dump
 
-from libmuscle import Instance, Message
+from libmuscle import (
+        Instance, Message, HAS_NO_STATE_FOR_NEXT_USE, USES_CHECKPOINT_API)
 from libmuscle.manager.run_dir import RunDir
 
 from .conftest import run_manager_with_actors, ls_snapshots
@@ -18,7 +19,7 @@ def cache_component(max_channels=2):
              Operator.O_I: [f'sub_out{i+1}' for i in range(max_channels)],
              Operator.S: [f'sub_in{i+1}' for i in range(max_channels)],
              Operator.O_F: [f'out{i+1}' for i in range(max_channels)]}
-    instance = Instance(ports)
+    instance = Instance(ports, USES_CHECKPOINT_API)
 
     cache_t = float('-inf')
     cache_data = []
@@ -57,7 +58,7 @@ def cache_component(max_channels=2):
 def echo_component(max_channels=2):
     ports = {Operator.F_INIT: [f'in{i+1}' for i in range(max_channels)],
              Operator.O_F: [f'out{i+1}' for i in range(max_channels)]}
-    instance = Instance(ports, keeps_state_for_next_use=KeepsStateForNextUse.NO)
+    instance = Instance(ports, HAS_NO_STATE_FOR_NEXT_USE)
 
     while instance.reuse_instance():
         for p_in, p_out in zip(ports[Operator.F_INIT], ports[Operator.O_F]):
@@ -69,7 +70,7 @@ def main_component():
     instance = Instance({
             Operator.O_I: ['state_out'],
             Operator.S: ['Ai', 'Bi', 'Ci', 'Di'],
-            Operator.O_F: ['o_f']})
+            Operator.O_F: ['o_f']}, USES_CHECKPOINT_API)
 
     while instance.reuse_instance():
         dt = instance.get_setting('dt', 'float')
diff --git a/integration_test/test_snapshot_dispatch.py b/integration_test/test_snapshot_dispatch.py
index 7102a43c..f604663e 100644
--- a/integration_test/test_snapshot_dispatch.py
+++ b/integration_test/test_snapshot_dispatch.py
@@ -1,7 +1,8 @@
 import pytest
-from ymmsl import KeepsStateForNextUse, Operator, load, dump
+from ymmsl import Operator, load, dump
 
-from libmuscle import Instance, Message
+from libmuscle import (
+        Instance, Message, HAS_NO_STATE_FOR_NEXT_USE, USES_CHECKPOINT_API)
 from libmuscle.manager.run_dir import RunDir
 
 from .conftest import run_manager_with_actors, ls_snapshots
@@ -13,7 +14,7 @@
 def component():
     instance = Instance({
             Operator.F_INIT: ['f_i'],
-            Operator.O_F: ['o_f']})
+            Operator.O_F: ['o_f']}, USES_CHECKPOINT_API)
 
     while instance.reuse_instance():
         dt = instance.get_setting('dt', 'float')
@@ -47,7 +48,7 @@ def stateless_component():
     instance = Instance({
             Operator.F_INIT: ['f_i'],
             Operator.O_F: ['o_f']},
-            stateful=KeepsStateForNextUse.NO)
+            HAS_NO_STATE_FOR_NEXT_USE)
 
     while instance.reuse_instance():
         dt = instance.get_setting('dt', 'float')
diff --git a/integration_test/test_snapshot_interact.py b/integration_test/test_snapshot_interact.py
index 5492f9e2..1fc2a5d8 100644
--- a/integration_test/test_snapshot_interact.py
+++ b/integration_test/test_snapshot_interact.py
@@ -5,7 +5,7 @@
 import pytest
 from ymmsl import Operator, load, dump
 
-from libmuscle import Instance, Message
+from libmuscle import Instance, Message, USES_CHECKPOINT_API
 from libmuscle.manager.run_dir import RunDir
 
 from .conftest import run_manager_with_actors, ls_snapshots
@@ -21,7 +21,7 @@
 def component():
     instance = Instance({
             Operator.O_I: ['o_i'],
-            Operator.S: ['s']})
+            Operator.S: ['s']}, USES_CHECKPOINT_API)
 
     while instance.reuse_instance():
         t0 = instance.get_setting('t0', 'float')
diff --git a/integration_test/test_snapshot_macro_micro.py b/integration_test/test_snapshot_macro_micro.py
index 885ac704..2e660606 100644
--- a/integration_test/test_snapshot_macro_micro.py
+++ b/integration_test/test_snapshot_macro_micro.py
@@ -1,7 +1,8 @@
 import pytest
-from ymmsl import KeepsStateForNextUse, Operator, load, dump
+from ymmsl import Operator, load, dump
 
-from libmuscle import Instance, Message
+from libmuscle import (
+        Instance, Message, HAS_NO_STATE_FOR_NEXT_USE, USES_CHECKPOINT_API)
 from libmuscle.manager.run_dir import RunDir
 
 from .conftest import run_manager_with_actors, ls_snapshots
@@ -13,7 +14,7 @@
 def macro():
     instance = Instance({
             Operator.O_I: ['o_i'],
-            Operator.S: ['s']})
+            Operator.S: ['s']}, USES_CHECKPOINT_API)
 
     while instance.reuse_instance():
         dt = instance.get_setting('dt', 'float')
@@ -52,7 +53,7 @@ def macro():
 def macro_vector():
     instance = Instance({
             Operator.O_I: ['o_i[]'],
-            Operator.S: ['s[]']})
+            Operator.S: ['s[]']}, USES_CHECKPOINT_API)
 
     while instance.reuse_instance():
         dt = instance.get_setting('dt', 'float')
@@ -93,7 +94,7 @@ def macro_vector():
 def micro():
     instance = Instance({
             Operator.F_INIT: ['f_i'],
-            Operator.O_F: ['o_f']})
+            Operator.O_F: ['o_f']}, USES_CHECKPOINT_API)
 
     while instance.reuse_instance():
         dt = instance.get_setting('dt', 'float')
@@ -127,7 +128,7 @@ def stateless_micro():
     instance = Instance({
             Operator.F_INIT: ['f_i'],
             Operator.O_F: ['o_f']},
-            keeps_state_for_next_use=KeepsStateForNextUse.NO)
+            HAS_NO_STATE_FOR_NEXT_USE)
 
     while instance.reuse_instance():
         dt = instance.get_setting('dt', 'float')
@@ -149,7 +150,7 @@ def data_transformer():
     instance = Instance({
             Operator.F_INIT: ['f_i'],
             Operator.O_F: ['o_f']},
-            keeps_state_for_next_use=KeepsStateForNextUse.NO)
+            HAS_NO_STATE_FOR_NEXT_USE)
 
     while instance.reuse_instance():
         msg = instance.receive('f_i')
diff --git a/libmuscle/python/libmuscle/__init__.py b/libmuscle/python/libmuscle/__init__.py
index 707dbdae..49653a02 100644
--- a/libmuscle/python/libmuscle/__init__.py
+++ b/libmuscle/python/libmuscle/__init__.py
@@ -1,6 +1,6 @@
 from libmuscle.communicator import Message
 from libmuscle.grid import Grid
-from libmuscle.instance import Instance
+from libmuscle.instance import Instance, InstanceFlags
 from libmuscle.version import __version__
 from libmuscle import runner
 
@@ -8,4 +8,11 @@
 # Note that libmuscle.version above is created by the build system; it's okay
 # that it's not present.
 
-__all__ = ['__version__', 'Grid', 'Instance', 'Message', 'runner']
+__all__ = [
+        '__version__', 'Grid', 'Instance', 'InstanceFlags', 'Message', 'runner']
+
+
+# export InstanceFlag members to the module namespace
+# adapted from https://github.com/python/cpython/blob/3.10/Lib/re.py#L179
+globals().update(InstanceFlags.__members__)
+__all__.extend(InstanceFlags.__members__)
diff --git a/libmuscle/python/libmuscle/api_guard.py b/libmuscle/python/libmuscle/api_guard.py
index 56012448..6fa7ee51 100644
--- a/libmuscle/python/libmuscle/api_guard.py
+++ b/libmuscle/python/libmuscle/api_guard.py
@@ -1,5 +1,4 @@
 from enum import auto, Enum
-from typing import Optional
 
 
 class APIPhase(Enum):
@@ -28,9 +27,6 @@ class APIPhase(Enum):
     BEFORE_REUSE_INSTANCE = auto()
     """Before calling reuse_instance"""
 
-    AFTER_REUSE_INSTANCE = auto()
-    """At the top of the reuse loop"""
-
     BEFORE_RESUMING = auto()
     """Between reuse_instance and resuming"""
 
@@ -62,13 +58,13 @@ class APIGuard:
     called to signal that the corresponding function finished
     successfully, and that we are moving on to the next phase.
     """
-    def __init__(self) -> None:
+    def __init__(self, uses_checkpointing: bool) -> None:
         """Create an APIPhaseTracker.
 
         This starts the tracker in BEFORE_FIRST_REUSE_INSTANCE.
         """
         self._phase = APIPhase.BEFORE_FIRST_REUSE_INSTANCE
-        self._uses_checkpointing = None     # type: Optional[bool]
+        self._uses_checkpointing = uses_checkpointing
 
     def _generic_error_messages(self, verify_phase: str) -> None:
         if self._phase in (
@@ -80,10 +76,6 @@ def _generic_error_messages(self, verify_phase: str) -> None:
                     'Please do not call {verify_phase} after'
                     ' should_save_final_snapshot. should_save_final_snapshot'
                     ' should be at the end of the reuse loop.')
-        elif self._phase == APIPhase.AFTER_REUSE_INSTANCE:
-            msg = (
-                    'Please call resuming first in the reuse loop, before'
-                    f' {verify_phase}')
         elif self._phase == APIPhase.BEFORE_RESUMING:
             msg = 'Inside the reuse loop you must call resuming first.'
         elif self._phase == APIPhase.BEFORE_LOAD_SNAPSHOT:
@@ -106,30 +98,9 @@ def _generic_error_messages(self, verify_phase: str) -> None:
             return
         raise RuntimeError(msg)
 
-    def uses_checkpointing(self) -> bool:
-        """Return whether the code is using checkpointing.
-
-        We can only determine that the code doesn't use checkpointing
-        if there are no checkpointing calls between the first and
-        second calls to reuse_instance. So this function should only
-        be called after the second call to verify_reuse_instance, or
-        it may raise if the code does not use checkpointing.
-
-        Raises:
-            RuntimeError: if we are at a point where we cannot know
-                the answer yet.
-        """
-        if self._uses_checkpointing is not None:
-            return self._uses_checkpointing
-        raise RuntimeError(
-                'The API was implemented incorrectly, please consult the'
-                ' documentation.')
-
     def verify_reuse_instance(self) -> None:
         """Check reuse_instance()"""
-        if self._phase == APIPhase.AFTER_REUSE_INSTANCE:
-            self._uses_checkpointing = False
-        elif self._phase not in (
+        if self._phase not in (
                 APIPhase.BEFORE_REUSE_INSTANCE,
                 APIPhase.BEFORE_FIRST_REUSE_INSTANCE):
             raise RuntimeError(
@@ -146,17 +117,19 @@ def reuse_instance_done(self, reusing: bool) -> None:
         if not reusing:
             self._phase = APIPhase.AFTER_REUSE_LOOP
         else:
-            if self._uses_checkpointing is None:
-                self._phase = APIPhase.AFTER_REUSE_INSTANCE
-            elif self._uses_checkpointing:
+            if self._uses_checkpointing:
                 self._phase = APIPhase.BEFORE_RESUMING
             else:
                 self._phase = APIPhase.BEFORE_REUSE_INSTANCE
 
     def verify_resuming(self) -> None:
         """Check resuming()"""
-        if self._phase not in (
-                APIPhase.BEFORE_RESUMING, APIPhase.AFTER_REUSE_INSTANCE):
+        if not self._uses_checkpointing:
+            raise RuntimeError(
+                    'Please add the flag'
+                    ' :attr:`InstanceFlag.USES_CHECKPOINT_API` to your'
+                    ' instance to use the MUSCLE3 checkpointing API.')
+        if self._phase != APIPhase.BEFORE_RESUMING:
             raise RuntimeError(
                     'Please call resuming() only as the first thing in the'
                     ' reuse loop.')
diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py
index e4e685f7..406dcdbd 100644
--- a/libmuscle/python/libmuscle/instance.py
+++ b/libmuscle/python/libmuscle/instance.py
@@ -1,13 +1,15 @@
 from copy import copy
+from enum import Flag, auto
 import logging
 import os
 import sys
 from typing import cast, Dict, List, Optional, Tuple, overload
 # TODO: import from typing module when dropping support for python 3.7
 from typing_extensions import Literal
+import warnings
 
 from ymmsl import (Identifier, Operator, SettingValue, Port, Reference,
-                   Settings, KeepsStateForNextUse)
+                   Settings)
 
 from libmuscle.api_guard import APIGuard
 from libmuscle.checkpoint_triggers import TriggerManager
@@ -29,6 +31,62 @@
 _FInitCacheType = Dict[Tuple[str, Optional[int]], Message]
 
 
+class InstanceFlags(Flag):
+    """Enumeration of properties that an instance may have.
+
+    You may combine multiple flags using the bitwise OR operator `|`. For
+    example:
+
+    .. code-block:: python
+
+        from libmuscle import (
+                Instance, USES_CHECKPOINT_API, KEEPS_STATE_FOR_NEXT_USE)
+
+        ports = ...
+        flags = USES_CHECKPOINT_API | KEEPS_STATE_FOR_NEXT_USE
+        instance = Instance(ports, flags)
+    """
+
+    DONT_APPLY_OVERLAY = auto()
+    """Do not apply the received settings overlay during prereceive of F_INIT
+    messages. If you're going to use :meth:`Instance.receive_with_settings` on
+    your F_INIT ports, you need to set this flag when creating an
+    :class:`Instance`.
+
+    If you don't know what that means, do not specify this flag and everything
+    will be fine. If it turns out that you did need to specify the flag, MUSCLE3
+    will tell you about it in an error message and you can add it still.
+    """
+
+    USES_CHECKPOINT_API = auto()
+    """Indicate that this instance supports checkpointing.
+
+    You may not use any checkpointing API calls when this flag is not supplied.
+    """
+
+    HAS_NO_STATE_FOR_NEXT_USE = auto()
+    """Indicate this instance does not carry state between iterations of the
+    reuse loop.
+
+    This corresponds to :external:py:attr:`ymmsl.KeepsStateForNextUse.NO`.
+
+    If neither :attr:`HAS_NO_STATE_FOR_NEXT_USE` and
+    :attr:`STATE_FOR_NEXT_USE_NOT_REQUIRED` are supplied, this corresponds to
+    :external:py:attr:`ymmsl.KeepsStateForNextUse.REQUIRED`.
+    """
+
+    STATE_FOR_NEXT_USE_NOT_REQUIRED = auto()
+    """Indicate this instance carries state between iterations of the
+    reuse loop, however this state is not required for restarting.
+
+    This corresponds to :external:py:attr:`ymmsl.KeepsStateForNextUse.HELPFUL`.
+
+    If neither :attr:`HAS_NO_STATE_FOR_NEXT_USE` and
+    :attr:`STATE_FOR_NEXT_USE_NOT_REQUIRED` are supplied, this corresponds to
+    :external:py:attr:`ymmsl.KeepsStateForNextUse.REQUIRED`.
+    """
+
+
 class Instance:
     """Represents a component instance in a MUSCLE3 simulation.
 
@@ -37,21 +95,19 @@ class Instance:
     """
     def __init__(
             self, ports: Optional[Dict[Operator, List[str]]] = None,
-            keeps_state_for_next_use: KeepsStateForNextUse
-            = KeepsStateForNextUse.NECESSARY) -> None:
+            flags: InstanceFlags = InstanceFlags(0)) -> None:
         """Create an Instance.
 
         Args:
             ports: A list of port names for each
                 :external:py:class:`~ymmsl.Operator` of this component.
-            keeps_state_for_next_use: Indicate whether this instance carries
-                state between iterations of the reuse loop. See
-                :external:py:class:`ymmsl.KeepsStateForNextUse` for a
-                description of the options.
+            flags: Indicate properties for this instance. See
+                :py:class:`InstanceFlags` for a detailed description of possible
+                flags.
         """
         self.__is_shut_down = False
 
-        self._keeps_state = KeepsStateForNextUse(keeps_state_for_next_use)
+        self._flags = InstanceFlags(flags)
 
         # Note that these are accessed by Muscle3, but otherwise private.
         self._name, self._index = self.__make_full_name()
@@ -63,7 +119,8 @@ def __init__(
 
         self.__set_up_logging()
 
-        self._api_guard = APIGuard()
+        self._api_guard = APIGuard(
+                InstanceFlags.USES_CHECKPOINT_API in self._flags)
         """Checks that the user uses the API correctly."""
 
         self._profiler = Profiler(self._instance_name(), self.__manager)
@@ -126,7 +183,7 @@ def __init__(
         self._set_local_log_level()
         self._set_remote_log_level()
 
-    def reuse_instance(self, apply_overlay: bool = True) -> bool:
+    def reuse_instance(self, apply_overlay: Optional[bool] = None) -> bool:
         """Decide whether to run this instance again.
 
         In a multiscale simulation, instances get reused all the time.
@@ -149,16 +206,6 @@ def reuse_instance(self, apply_overlay: bool = True) -> bool:
         i.e. before the F_INIT operator, and its return value should
         decide whether to enter that loop again.
 
-        Args:
-            apply_overlay: Whether to apply the received settings
-                overlay or to save it. If you're going to use
-                :meth:`receive_with_settings` on your F_INIT ports,
-                set this to False. If you don't know what that means,
-                just call :meth:`reuse_instance()` without specifying this
-                and everything will be fine. If it turns out that you
-                did need to specify False, MUSCLE3 will tell you about
-                it in an error message and you can add it still.
-
         Raises:
             RuntimeError:
                 When implementing the checkpointing API, but libmuscle detected
@@ -183,8 +230,9 @@ def reuse_instance(self, apply_overlay: bool = True) -> bool:
 
         do_implicit_checkpoint = (
                 not self._first_run and
-                not self._api_guard.uses_checkpointing() and
-                self._keeps_state is not KeepsStateForNextUse.NECESSARY)
+                InstanceFlags.USES_CHECKPOINT_API not in self._flags and
+                (InstanceFlags.STATE_FOR_NEXT_USE_NOT_REQUIRED in self._flags or
+                 InstanceFlags.HAS_NO_STATE_FOR_NEXT_USE in self._flags))
 
         if do_implicit_checkpoint:
             if self._trigger_manager.should_save_final_snapshot(
@@ -567,7 +615,7 @@ def save_snapshot(self, message: Message) -> None:
         self._save_snapshot(message, False)
         self._api_guard.save_snapshot_done()
 
-    def should_save_final_snapshot(self, *, apply_overlay: bool = True) -> bool:
+    def should_save_final_snapshot(self) -> bool:
         """Check if a snapshot should be saved at the end of the reuse loop.
 
         This method checks if a snapshot should be saved now.
@@ -601,7 +649,7 @@ def should_save_final_snapshot(self, *, apply_overlay: bool = True) -> bool:
         """
         self._api_guard.verify_should_save_final_snapshot()
 
-        self._do_reuse = self._decide_reuse_instance(apply_overlay)
+        self._do_reuse = self._decide_reuse_instance()
         result = self._trigger_manager.should_save_final_snapshot(
                 self._do_reuse, self.__f_init_max_timestamp)
 
@@ -718,7 +766,8 @@ def __set_up_logging(self) -> None:
                                                      self.__manager)
             logging.getLogger().addHandler(self._mmp_handler)
 
-    def _decide_reuse_instance(self, apply_overlay: bool) -> bool:
+    def _decide_reuse_instance(
+            self, apply_overlay: Optional[bool] = None) -> bool:
         """Decide whether and how to reuse the instance.
 
         This sets self._first_run, self._do_resume and self._do_init, and
@@ -801,10 +850,11 @@ def __receive_message(
                 if with_settings and msg.settings is None:
                     err_msg = ('If you use receive_with_settings()'
                                ' on an F_INIT port, then you have to'
-                               ' pass apply_overlay=False to reuse_instance() '
-                               ' and should_save_final_snapshot(),'
-                               ' if applicable, otherwise the settings will'
-                               ' already have been applied by MUSCLE.')
+                               ' set the flag'
+                               ' :attr:`InstanceFlag.DONT_APPLY_OVERLAY` when'
+                               ' creating the :class:`Instance`, otherwise the'
+                               ' settings will already have been applied by'
+                               ' MUSCLE.')
                     self.__shutdown(err_msg)
                     raise RuntimeError(err_msg)
             else:
@@ -922,7 +972,7 @@ def _have_f_init_connections(self) -> bool:
                  for port in ports.get(Operator.F_INIT, [])])
         return f_init_connected or self._communicator.settings_in_connected()
 
-    def _pre_receive(self, apply_overlay: bool) -> bool:
+    def _pre_receive(self, apply_overlay: Optional[bool]) -> bool:
         """Pre-receives on all ports.
 
         This includes muscle_settings_in and all user-defined ports.
@@ -965,12 +1015,20 @@ def __receive_settings(self) -> bool:
         self._trigger_manager.harmonise_wall_time(saved_until)
         return True
 
-    def __pre_receive_f_init(self, apply_overlay: bool) -> None:
+    def __pre_receive_f_init(self, apply_overlay: Optional[bool]) -> None:
         """Receives on all ports connected to F_INIT.
 
         This receives all incoming messages on F_INIT and stores them
         in self._f_init_cache.
         """
+        if apply_overlay is not None:
+            warnings.warn(
+                    'Explicitly providing apply_overlay in reuse_instance is'
+                    ' deprecated. Use InstanceFlags.DONT_APPLY_OVERLAY when'
+                    ' creating the instance instead.', DeprecationWarning)
+        else:
+            apply_overlay = InstanceFlags.DONT_APPLY_OVERLAY not in self._flags
+
         def pre_receive(port_name: str, slot: Optional[int]) -> None:
             msg, saved_until = self._communicator.receive_message(
                     port_name, slot)
diff --git a/libmuscle/python/libmuscle/test/conftest.py b/libmuscle/python/libmuscle/test/conftest.py
index 77422ee9..201a10f4 100644
--- a/libmuscle/python/libmuscle/test/conftest.py
+++ b/libmuscle/python/libmuscle/test/conftest.py
@@ -26,4 +26,4 @@ def message2() -> Message:
 
 @pytest.fixture
 def guard() -> APIGuard:
-    return APIGuard()
+    return APIGuard(True)
diff --git a/libmuscle/python/libmuscle/test/test_api_guard.py b/libmuscle/python/libmuscle/test/test_api_guard.py
index f67bde93..4636c775 100644
--- a/libmuscle/python/libmuscle/test/test_api_guard.py
+++ b/libmuscle/python/libmuscle/test/test_api_guard.py
@@ -5,20 +5,17 @@
 from libmuscle.api_guard import APIGuard
 
 
-def test_no_checkpointing_support(guard):
+def test_no_checkpointing_support():
+    guard = APIGuard(False)
     for _ in range(3):
         guard.verify_reuse_instance()
         guard.reuse_instance_done(True)
 
-    assert not guard.uses_checkpointing()
-
     guard.verify_reuse_instance()
     guard.reuse_instance_done(False)
 
-    assert not guard.uses_checkpointing()
-
 
-def test_final_snapshot_only(guard):
+def test_final_snapshot_only(guard: APIGuard):
     for i in range(4):
         guard.verify_reuse_instance()
         guard.reuse_instance_done(True)
@@ -48,7 +45,7 @@ def test_final_snapshot_only(guard):
     guard.reuse_instance_done(False)
 
 
-def test_full_checkpointing(guard):
+def test_full_checkpointing(guard: APIGuard):
     for i in range(4):
         guard.verify_reuse_instance()
         guard.reuse_instance_done(True)
@@ -133,20 +130,19 @@ def test_missing_step(guard, fun):
     check_all_raise_except(guard, {fun})
 
 
-def test_missing_resuming(guard):
+def test_missing_resuming(guard: APIGuard):
     run_until_before(guard, APIGuard.verify_resuming)
-    check_all_raise_except(guard, {
-        APIGuard.verify_resuming, APIGuard.verify_reuse_instance})
+    check_all_raise_except(guard, {APIGuard.verify_resuming})
 
 
-def test_missing_should_save_final(guard):
+def test_missing_should_save_final(guard: APIGuard):
     run_until_before(guard, APIGuard.verify_should_save_final_snapshot)
     check_all_raise_except(guard, {
         APIGuard.verify_should_save_snapshot,
         APIGuard.verify_should_save_final_snapshot})
 
 
-def test_double_should_save(guard):
+def test_double_should_save(guard: APIGuard):
     run_until_before(guard, APIGuard.verify_should_save_snapshot)
     guard.verify_should_save_snapshot()
     guard.should_save_snapshot_done(True)

From 262968146b26f12b686692b2f286e346f489d538 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Wed, 11 Jan 2023 17:12:34 +0100
Subject: [PATCH 133/183] Fix docstring

---
 libmuscle/python/libmuscle/instance.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py
index 406dcdbd..d0db648a 100644
--- a/libmuscle/python/libmuscle/instance.py
+++ b/libmuscle/python/libmuscle/instance.py
@@ -40,10 +40,10 @@ class InstanceFlags(Flag):
     .. code-block:: python
 
         from libmuscle import (
-                Instance, USES_CHECKPOINT_API, KEEPS_STATE_FOR_NEXT_USE)
+                Instance, USES_CHECKPOINT_API, DONT_APPLY_OVERLAY)
 
         ports = ...
-        flags = USES_CHECKPOINT_API | KEEPS_STATE_FOR_NEXT_USE
+        flags = USES_CHECKPOINT_API | DONT_APPLY_OVERLAY
         instance = Instance(ports, flags)
     """
 

From c3177f667d58260e2c7422cefb270f7216f3be3d Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Thu, 12 Jan 2023 10:21:19 +0100
Subject: [PATCH 134/183] Process review comments

---
 .../test_snapshot_complex_coupling.py            |  4 ++--
 integration_test/test_snapshot_dispatch.py       |  4 ++--
 integration_test/test_snapshot_macro_micro.py    |  6 +++---
 libmuscle/python/libmuscle/instance.py           | 16 ++++++++--------
 4 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/integration_test/test_snapshot_complex_coupling.py b/integration_test/test_snapshot_complex_coupling.py
index e3b408de..e374347d 100644
--- a/integration_test/test_snapshot_complex_coupling.py
+++ b/integration_test/test_snapshot_complex_coupling.py
@@ -5,7 +5,7 @@
 from ymmsl import Operator, load, dump
 
 from libmuscle import (
-        Instance, Message, HAS_NO_STATE_FOR_NEXT_USE, USES_CHECKPOINT_API)
+        Instance, Message, KEEPS_NO_STATE_FOR_NEXT_USE, USES_CHECKPOINT_API)
 from libmuscle.manager.run_dir import RunDir
 
 from .conftest import run_manager_with_actors, ls_snapshots
@@ -58,7 +58,7 @@ def cache_component(max_channels=2):
 def echo_component(max_channels=2):
     ports = {Operator.F_INIT: [f'in{i+1}' for i in range(max_channels)],
              Operator.O_F: [f'out{i+1}' for i in range(max_channels)]}
-    instance = Instance(ports, HAS_NO_STATE_FOR_NEXT_USE)
+    instance = Instance(ports, KEEPS_NO_STATE_FOR_NEXT_USE)
 
     while instance.reuse_instance():
         for p_in, p_out in zip(ports[Operator.F_INIT], ports[Operator.O_F]):
diff --git a/integration_test/test_snapshot_dispatch.py b/integration_test/test_snapshot_dispatch.py
index f604663e..021ac676 100644
--- a/integration_test/test_snapshot_dispatch.py
+++ b/integration_test/test_snapshot_dispatch.py
@@ -2,7 +2,7 @@
 from ymmsl import Operator, load, dump
 
 from libmuscle import (
-        Instance, Message, HAS_NO_STATE_FOR_NEXT_USE, USES_CHECKPOINT_API)
+        Instance, Message, KEEPS_NO_STATE_FOR_NEXT_USE, USES_CHECKPOINT_API)
 from libmuscle.manager.run_dir import RunDir
 
 from .conftest import run_manager_with_actors, ls_snapshots
@@ -48,7 +48,7 @@ def stateless_component():
     instance = Instance({
             Operator.F_INIT: ['f_i'],
             Operator.O_F: ['o_f']},
-            HAS_NO_STATE_FOR_NEXT_USE)
+            KEEPS_NO_STATE_FOR_NEXT_USE)
 
     while instance.reuse_instance():
         dt = instance.get_setting('dt', 'float')
diff --git a/integration_test/test_snapshot_macro_micro.py b/integration_test/test_snapshot_macro_micro.py
index 2e660606..2f6bc1e2 100644
--- a/integration_test/test_snapshot_macro_micro.py
+++ b/integration_test/test_snapshot_macro_micro.py
@@ -2,7 +2,7 @@
 from ymmsl import Operator, load, dump
 
 from libmuscle import (
-        Instance, Message, HAS_NO_STATE_FOR_NEXT_USE, USES_CHECKPOINT_API)
+        Instance, Message, KEEPS_NO_STATE_FOR_NEXT_USE, USES_CHECKPOINT_API)
 from libmuscle.manager.run_dir import RunDir
 
 from .conftest import run_manager_with_actors, ls_snapshots
@@ -128,7 +128,7 @@ def stateless_micro():
     instance = Instance({
             Operator.F_INIT: ['f_i'],
             Operator.O_F: ['o_f']},
-            HAS_NO_STATE_FOR_NEXT_USE)
+            KEEPS_NO_STATE_FOR_NEXT_USE)
 
     while instance.reuse_instance():
         dt = instance.get_setting('dt', 'float')
@@ -150,7 +150,7 @@ def data_transformer():
     instance = Instance({
             Operator.F_INIT: ['f_i'],
             Operator.O_F: ['o_f']},
-            HAS_NO_STATE_FOR_NEXT_USE)
+            KEEPS_NO_STATE_FOR_NEXT_USE)
 
     while instance.reuse_instance():
         msg = instance.receive('f_i')
diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py
index d0db648a..a1e13ec4 100644
--- a/libmuscle/python/libmuscle/instance.py
+++ b/libmuscle/python/libmuscle/instance.py
@@ -64,25 +64,25 @@ class InstanceFlags(Flag):
     You may not use any checkpointing API calls when this flag is not supplied.
     """
 
-    HAS_NO_STATE_FOR_NEXT_USE = auto()
+    KEEPS_NO_STATE_FOR_NEXT_USE = auto()
     """Indicate this instance does not carry state between iterations of the
     reuse loop.
 
     This corresponds to :external:py:attr:`ymmsl.KeepsStateForNextUse.NO`.
 
-    If neither :attr:`HAS_NO_STATE_FOR_NEXT_USE` and
-    :attr:`STATE_FOR_NEXT_USE_NOT_REQUIRED` are supplied, this corresponds to
+    If neither :attr:`KEEPS_NO_STATE_FOR_NEXT_USE` and
+    :attr:`STATE_NOT_REQUIRED_FOR_NEXT_USE` are supplied, this corresponds to
     :external:py:attr:`ymmsl.KeepsStateForNextUse.REQUIRED`.
     """
 
-    STATE_FOR_NEXT_USE_NOT_REQUIRED = auto()
+    STATE_NOT_REQUIRED_FOR_NEXT_USE = auto()
     """Indicate this instance carries state between iterations of the
     reuse loop, however this state is not required for restarting.
 
     This corresponds to :external:py:attr:`ymmsl.KeepsStateForNextUse.HELPFUL`.
 
-    If neither :attr:`HAS_NO_STATE_FOR_NEXT_USE` and
-    :attr:`STATE_FOR_NEXT_USE_NOT_REQUIRED` are supplied, this corresponds to
+    If neither :attr:`KEEPS_NO_STATE_FOR_NEXT_USE` and
+    :attr:`STATE_NOT_REQUIRED_FOR_NEXT_USE` are supplied, this corresponds to
     :external:py:attr:`ymmsl.KeepsStateForNextUse.REQUIRED`.
     """
 
@@ -231,8 +231,8 @@ def reuse_instance(self, apply_overlay: Optional[bool] = None) -> bool:
         do_implicit_checkpoint = (
                 not self._first_run and
                 InstanceFlags.USES_CHECKPOINT_API not in self._flags and
-                (InstanceFlags.STATE_FOR_NEXT_USE_NOT_REQUIRED in self._flags or
-                 InstanceFlags.HAS_NO_STATE_FOR_NEXT_USE in self._flags))
+                (InstanceFlags.STATE_NOT_REQUIRED_FOR_NEXT_USE in self._flags or
+                 InstanceFlags.KEEPS_NO_STATE_FOR_NEXT_USE in self._flags))
 
         if do_implicit_checkpoint:
             if self._trigger_manager.should_save_final_snapshot(

From dda214bc507aa44c78d381da39d3685eca34376c Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Thu, 12 Jan 2023 13:35:28 +0100
Subject: [PATCH 135/183] Save and restore settings overlay

---
 libmuscle/python/libmuscle/instance.py             |  6 +++++-
 libmuscle/python/libmuscle/snapshot.py             | 12 +++++++++---
 libmuscle/python/libmuscle/snapshot_manager.py     | 10 +++++++---
 libmuscle/python/libmuscle/test/test_snapshot.py   |  9 ++++++---
 .../python/libmuscle/test/test_snapshot_manager.py | 14 +++++++++-----
 5 files changed, 36 insertions(+), 15 deletions(-)

diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py
index 732f6919..39bd0fe5 100644
--- a/libmuscle/python/libmuscle/instance.py
+++ b/libmuscle/python/libmuscle/instance.py
@@ -119,6 +119,10 @@ def __init__(
         resume_snapshot, snapshot_dir = checkpoint_info[2:4]
         saved_at = self._snapshot_manager.prepare_resume(
                 resume_snapshot, snapshot_dir)
+        # Resume settings overlay
+        overlay = self._snapshot_manager._resume_overlay
+        if overlay is not None:
+            self._settings_manager.overlay = overlay
 
         if saved_at is not None:
             self._trigger_manager.update_checkpoints(saved_at)
@@ -767,7 +771,7 @@ def _save_snapshot(
         walltime = self._trigger_manager.elapsed_walltime()
         timestamp = self._snapshot_manager.save_snapshot(
                 message, final, triggers, walltime,
-                f_init_max_timestamp)
+                f_init_max_timestamp, self._settings_manager.overlay)
         self._trigger_manager.update_checkpoints(timestamp)
 
     def __receive_message(
diff --git a/libmuscle/python/libmuscle/snapshot.py b/libmuscle/python/libmuscle/snapshot.py
index 2f86a220..cda03dc5 100644
--- a/libmuscle/python/libmuscle/snapshot.py
+++ b/libmuscle/python/libmuscle/snapshot.py
@@ -21,12 +21,16 @@ def __init__(self,
                  wallclock_time: float,
                  port_message_counts: Dict[str, List[int]],
                  is_final_snapshot: bool,
-                 message: Optional['communicator.Message']) -> None:
+                 message: Optional['communicator.Message'],
+                 settings_overlay: Settings) -> None:
         self.triggers = triggers
         self.wallclock_time = wallclock_time
         self.port_message_counts = port_message_counts
         self.is_final_snapshot = is_final_snapshot
         self.message = message
+        # self.message is None for implicit snapshots, so we cannot store the
+        # Settings overlay in that message object.
+        self.settings_overlay = settings_overlay
 
     @classmethod
     @abstractmethod
@@ -62,7 +66,8 @@ def from_bytes(cls, data: bytes) -> 'Snapshot':
                    dct['wallclock_time'],
                    dct['port_message_counts'],
                    dct['is_final_snapshot'],
-                   cls.bytes_to_message(dct['message']))
+                   cls.bytes_to_message(dct['message']),
+                   Settings(dct['settings_overlay']))
 
     def to_bytes(self) -> bytes:
         return cast(bytes, msgpack.dumps({
@@ -70,7 +75,8 @@ def to_bytes(self) -> bytes:
             'wallclock_time': self.wallclock_time,
             'port_message_counts': self.port_message_counts,
             'is_final_snapshot': self.is_final_snapshot,
-            'message': self.message_to_bytes(self.message)
+            'message': self.message_to_bytes(self.message),
+            'settings_overlay': self.settings_overlay.as_ordered_dict()
         }))
 
     @staticmethod
diff --git a/libmuscle/python/libmuscle/snapshot_manager.py b/libmuscle/python/libmuscle/snapshot_manager.py
index f756d05d..e23b73dc 100644
--- a/libmuscle/python/libmuscle/snapshot_manager.py
+++ b/libmuscle/python/libmuscle/snapshot_manager.py
@@ -2,7 +2,7 @@
 from pathlib import Path
 from typing import cast, List, Optional
 
-from ymmsl import Reference, Operator
+from ymmsl import Reference, Operator, Settings
 
 from libmuscle.communicator import Communicator, Message
 from libmuscle.mmp_client import MMPClient
@@ -44,6 +44,7 @@ def __init__(self,
         self._manager = manager
 
         self._resume_from_snapshot = None   # type: Optional[Snapshot]
+        self._resume_overlay = Settings()
         self._next_snapshot_num = 1
 
     def prepare_resume(
@@ -73,6 +74,7 @@ def prepare_resume(
                 # snapshot.message is None for implicit snapshots
                 self._resume_from_snapshot = snapshot
                 result = snapshot.message.timestamp
+            self._resume_overlay = snapshot.settings_overlay
 
             self._communicator.restore_message_counts(
                 snapshot.port_message_counts)
@@ -112,7 +114,8 @@ def load_snapshot(self) -> Message:
     def save_snapshot(
             self, msg: Optional[Message], final: bool,
             triggers: List[str], wallclock_time: float,
-            f_init_max_timestamp: Optional[float] = None,
+            f_init_max_timestamp: Optional[float],
+            settings_overlay: Settings
             ) -> float:
         """Save a (final) snapshot.
 
@@ -140,7 +143,8 @@ def save_snapshot(
                 port_message_counts[port_name] = new_counts
 
         snapshot = MsgPackSnapshot(
-            triggers, wallclock_time, port_message_counts, final, msg)
+                triggers, wallclock_time, port_message_counts, final, msg,
+                settings_overlay)
 
         path = self.__store_snapshot(snapshot)
         metadata = SnapshotMetadata.from_snapshot(snapshot, str(path))
diff --git a/libmuscle/python/libmuscle/test/test_snapshot.py b/libmuscle/python/libmuscle/test/test_snapshot.py
index f459a001..372c8cd0 100644
--- a/libmuscle/python/libmuscle/test/test_snapshot.py
+++ b/libmuscle/python/libmuscle/test/test_snapshot.py
@@ -13,12 +13,15 @@ def snapshot() -> Snapshot:
     is_final = True
     message = Message(1.2, None, 'test_data')
     snapshot = MsgPackSnapshot(
-            triggers, wallclock_time, port_message_counts, is_final, message)
+            triggers, wallclock_time, port_message_counts, is_final, message,
+            Settings({'test': 1}))
     assert snapshot.triggers == triggers
     assert snapshot.wallclock_time == wallclock_time
     assert snapshot.port_message_counts == port_message_counts
     assert snapshot.is_final_snapshot == is_final
     assert snapshot.message == message
+    assert snapshot.settings_overlay.keys() == {'test'}
+    assert snapshot.settings_overlay['test'] == 1
     return snapshot
 
 
@@ -53,7 +56,7 @@ def test_snapshot_metadata(snapshot: Snapshot) -> None:
 
 def test_message_with_settings() -> None:
     message = Message(1.0, 2.0, 'test_data', Settings({'setting': True}))
-    snapshot = MsgPackSnapshot([], 0, {}, False, message)
+    snapshot = MsgPackSnapshot([], 0, {}, False, message, Settings())
     assert snapshot.message.settings.get('setting') is True
 
     binary_snapshot = snapshot.to_bytes()
@@ -65,7 +68,7 @@ def test_message_with_settings() -> None:
 
 def test_implicit_snapshot() -> None:
     message = None
-    snapshot = MsgPackSnapshot([], 0, {}, True, message)
+    snapshot = MsgPackSnapshot([], 0, {}, True, message, Settings())
     assert snapshot.message is None
 
     binary_snapshot = snapshot.to_bytes()
diff --git a/libmuscle/python/libmuscle/test/test_snapshot_manager.py b/libmuscle/python/libmuscle/test/test_snapshot_manager.py
index 31423bb0..e530ad06 100644
--- a/libmuscle/python/libmuscle/test/test_snapshot_manager.py
+++ b/libmuscle/python/libmuscle/test/test_snapshot_manager.py
@@ -1,7 +1,7 @@
 from pathlib import Path
 from unittest.mock import MagicMock
 
-from ymmsl import Reference
+from ymmsl import Reference, Settings
 
 from libmuscle.communicator import Message
 from libmuscle.snapshot import SnapshotMetadata
@@ -33,7 +33,8 @@ def test_save_load_snapshot(tmp_path: Path) -> None:
     assert not snapshot_manager.resuming_from_final()
 
     snapshot_manager.save_snapshot(
-            Message(0.2, None, 'test data'), False, ['test'], 13.0)
+            Message(0.2, None, 'test data'), False, ['test'], 13.0, None,
+            Settings())
 
     communicator.get_message_counts.assert_called_with()
     manager.submit_snapshot_metadata.assert_called()
@@ -63,7 +64,8 @@ def test_save_load_snapshot(tmp_path: Path) -> None:
     assert msg.data == 'test data'
 
     snapshot_manager2.save_snapshot(
-            Message(0.6, None, 'test data2'), True, ['test'], 42.2, 1.2)
+            Message(0.6, None, 'test data2'), True, ['test'], 42.2, 1.2,
+            Settings())
 
     instance, metadata = manager.submit_snapshot_metadata.call_args[0]
     assert instance == instance_id
@@ -99,7 +101,8 @@ def test_save_load_implicit_snapshot(tmp_path: Path) -> None:
     assert not snapshot_manager.resuming_from_intermediate()
     assert not snapshot_manager.resuming_from_final()
     # save implicit snapshot
-    snapshot_manager.save_snapshot(None, True, ['implicit'], 1.0, 1.5)
+    snapshot_manager.save_snapshot(
+            None, True, ['implicit'], 1.0, 1.5, Settings())
 
     manager.submit_snapshot_metadata.assert_called_once()
     instance, metadata = manager.submit_snapshot_metadata.call_args[0]
@@ -117,5 +120,6 @@ def test_save_load_implicit_snapshot(tmp_path: Path) -> None:
 
     assert not snapshot_manager2.resuming_from_intermediate()
     assert not snapshot_manager2.resuming_from_final()
-    snapshot_manager2.save_snapshot(None, True, ['implicit'], 12.3, 2.5)
+    snapshot_manager2.save_snapshot(
+            None, True, ['implicit'], 12.3, 2.5, Settings())
     manager.submit_snapshot_metadata.assert_called_once()

From f38cf18ee4c73c2d7fb8fe76e95534fc238a4814 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Thu, 12 Jan 2023 15:07:07 +0100
Subject: [PATCH 136/183] Implement version check

Fixes #148
---
 libmuscle/cpp/build/libmuscle/Makefile        | 22 +++++++----
 libmuscle/cpp/src/libmuscle/.gitignore        |  1 +
 libmuscle/cpp/src/libmuscle/mmp_client.cpp    |  4 +-
 libmuscle/cpp/src/libmuscle/version.h.in      |  2 +-
 .../python/libmuscle/manager/mmp_server.py    | 13 ++++++-
 .../manager/test/test_mmp_request_handler.py  | 38 ++++++++++++++++++-
 libmuscle/python/libmuscle/mmp_client.py      |  4 +-
 .../python/libmuscle/test/test_mmp_client.py  |  4 +-
 8 files changed, 73 insertions(+), 15 deletions(-)
 create mode 100644 libmuscle/cpp/src/libmuscle/.gitignore

diff --git a/libmuscle/cpp/build/libmuscle/Makefile b/libmuscle/cpp/build/libmuscle/Makefile
index 3fdc0117..d85d53b5 100644
--- a/libmuscle/cpp/build/libmuscle/Makefile
+++ b/libmuscle/cpp/build/libmuscle/Makefile
@@ -45,6 +45,7 @@ public_headers := libmuscle/data.hpp libmuscle/data.tpp libmuscle/instance.hpp
 public_headers += libmuscle/libmuscle.hpp libmuscle/mcp/data_pack.hpp
 public_headers += libmuscle/mcp/data_pack.tpp libmuscle/message.hpp
 public_headers += libmuscle/ports_description.hpp libmuscle/util.hpp libmuscle/util.tpp
+public_headers += libmuscle/version.h
 installed_headers := $(public_headers:%=$(PREFIX)/include/%)
 
 pkg_config_files := libmuscle.pc
@@ -76,7 +77,7 @@ test: tests
 
 .PHONY: clean
 clean:
-	rm -f libmuscle.a libmuscle.so libmuscle_d.a libmuscle_d.so version.h
+	rm -f libmuscle.a libmuscle.so libmuscle_d.a libmuscle_d.so $(srcdir)/version.h
 	rm -f libmuscle_mpi.a libmuscle_mpi.so libmuscle_mpi_d.a libmuscle_mpi_d.so
 	rm -f libmuscle.pc libmuscle_mpi.pc
 	rm -rf $(objdir)
@@ -116,11 +117,11 @@ LDFLAGS += $(shell export PKG_CONFIG_PATH=$(PKG_CONFIG_PATH):$(PKG_CONFIG_EXTRA_
 endif
 
 
-$(objdir)/%.d: %.cpp
+$(objdir)/%.d: %.cpp $(srcdir)/version.h
 	@mkdir -p $(@D)
 	$(CXX) $(CPPFLAGS) $(CXXFLAGS) -E -MM -MT $(@:.d=.o) $< -o $@
 
-$(objdir)/%.o: %.cpp
+$(objdir)/%.o: %.cpp $(srcdir)/version.h
 	@mkdir -p $(@D)
 	$(CXX) $(CPPFLAGS) $(CXXFLAGS) -c $< -o $@
 
@@ -139,28 +140,29 @@ $(objdir)/%.dlo: %.cpp $(objdir)/%.o
 	@mkdir -p $(@D)
 	$(CXX) $(CPPFLAGS) $(CXXFLAGS) $(DEBUGFLAGS) -fPIC -c $< -o $@
 
-$(objdir)/%.mo: %.cpp
+$(objdir)/%.mo: %.cpp $(srcdir)/version.h
 	@mkdir -p $(@D)
 	$(MPICXX) $(CPPFLAGS) $(CXXFLAGS) $(MPIFLAGS) -c $< -o $@
 
-$(objdir)/%.mlo: %.cpp
+$(objdir)/%.mlo: %.cpp $(srcdir)/version.h
 	@mkdir -p $(@D)
 	$(MPICXX) $(CPPFLAGS) $(CXXFLAGS) $(MPIFLAGS) -fPIC -c $< -o $@
 
-$(objdir)/%.mdo: %.cpp
+$(objdir)/%.mdo: %.cpp $(srcdir)/version.h
 	@mkdir -p $(@D)
 	$(MPICXX) $(CPPFLAGS) $(CXXFLAGS) $(DEBUGFLAGS) $(MPIFLAGS) -c $< -o $@
 
-$(objdir)/%.mdlo: %.cpp
+$(objdir)/%.mdlo: %.cpp $(srcdir)/version.h
 	@mkdir -p $(@D)
 	$(MPICXX) $(CPPFLAGS) $(CXXFLAGS) $(DEBUGFLAGS) $(MPIFLAGS) -fPIC -c $< -o $@
 
 
-version.h: version.h.in
+$(srcdir)/version.h: version.h.in
 	cp $< $@
 	sed -i -e 's/@PROJECT_VERSION_MAJOR@/$(major_version)/' $@
 	sed -i -e 's/@PROJECT_VERSION_MINOR@/$(minor_version)/' $@
 	sed -i -e 's/@PROJECT_VERSION_PATCH@/$(patch_version)/' $@
+	sed -i -e 's/@PROJECT_VERSION@/$(muscle_version)/' $@
 
 libmuscle.a: $(objects)
 	ar rcs $@ $^
@@ -186,6 +188,10 @@ libmuscle_mpi_d.a: $(mdobjects)
 libmuscle_mpi_d.so: $(mdlobjects)
 	$(MPICXX) -shared -Wl,--version-script=libmuscle_mpi.version -o $@ $^ $(LDFLAGS)
 
+$(PREFIX)/include/libmuscle/version.h: $(srcdir)/version.h
+	@mkdir -p $(@D)
+	cp $< $@
+
 $(PREFIX)/include/%.hpp: $(hdrdir)/%.hpp
 	@mkdir -p $(@D)
 	cp $< $@
diff --git a/libmuscle/cpp/src/libmuscle/.gitignore b/libmuscle/cpp/src/libmuscle/.gitignore
new file mode 100644
index 00000000..67020331
--- /dev/null
+++ b/libmuscle/cpp/src/libmuscle/.gitignore
@@ -0,0 +1 @@
+version.h
diff --git a/libmuscle/cpp/src/libmuscle/mmp_client.cpp b/libmuscle/cpp/src/libmuscle/mmp_client.cpp
index de50e894..6a7f33d6 100644
--- a/libmuscle/cpp/src/libmuscle/mmp_client.cpp
+++ b/libmuscle/cpp/src/libmuscle/mmp_client.cpp
@@ -3,6 +3,7 @@
 #include "libmuscle/data.hpp"
 #include "libmuscle/mcp/data_pack.hpp"
 #include "libmuscle/mcp/protocol.hpp"
+#include "libmuscle/version.h"
 
 #include <chrono>
 #include <iterator>
@@ -101,7 +102,8 @@ void MMPClient::register_instance(
 
     auto request = Data::list(
             static_cast<int>(RequestType::register_instance),
-            std::string(name), encoded_locs, encoded_ports);
+            std::string(name), encoded_locs, encoded_ports,
+            MUSCLE3_VERSION);
 
     auto response = call_manager_(request);
 
diff --git a/libmuscle/cpp/src/libmuscle/version.h.in b/libmuscle/cpp/src/libmuscle/version.h.in
index 8edb3a47..67718812 100644
--- a/libmuscle/cpp/src/libmuscle/version.h.in
+++ b/libmuscle/cpp/src/libmuscle/version.h.in
@@ -4,5 +4,5 @@
 #define MUSCLE3_VERSION_MINOR @PROJECT_VERSION_MINOR@
 #define MUSCLE3_VERSION_PATCH @PROJECT_VERSION_PATCH@
 
-#define MUSCLE3_VERSION "MUSCLE3_VERSION_MAJOR.MUSCLE3_VERSION_MINOR.MUSCLE3_VERSION_PATCH"
+#define MUSCLE3_VERSION "@PROJECT_VERSION@"
 
diff --git a/libmuscle/python/libmuscle/manager/mmp_server.py b/libmuscle/python/libmuscle/manager/mmp_server.py
index d609fce1..6fc4bae2 100644
--- a/libmuscle/python/libmuscle/manager/mmp_server.py
+++ b/libmuscle/python/libmuscle/manager/mmp_server.py
@@ -8,6 +8,7 @@
         Conduit, Identifier, Operator, Port, Reference, PartialConfiguration,
         Checkpoints)
 
+import libmuscle
 from libmuscle.logging import LogLevel
 from libmuscle.manager.instance_registry import (
         AlreadyRegistered, InstanceRegistry)
@@ -109,12 +110,14 @@ def handle_request(self, request: bytes) -> bytes:
 
     def _register_instance(
             self, instance_id: str, locations: List[str],
-            ports: List[List[str]]) -> Any:
+            ports: List[List[str]], version: str = '') -> Any:
         """Handle a register instance request.
 
         Args:
             instance_id: ID of the instance to register
             locations: Locations where it can be reached
+            ports: Ports of this instance
+            version: Version of libmuscle that this instance uses
 
         Returns:
             A list containing the following values:
@@ -123,6 +126,14 @@ def _register_instance(
             error_msg (str): An error message, only present if status
                 equals ERROR
         """
+        if version != libmuscle.__version__:
+            return [
+                    ResponseType.ERROR.value,
+                    f'Instance libmuscle version ({version}) does not match'
+                    f' manager libmuscle version ({libmuscle.__version__}).'
+                    ' Please ensure that the instance and the manager use the'
+                    ' same version of libmuscle.']
+
         port_objs = [decode_port(p) for p in ports]
         instance = Reference(instance_id)
         try:
diff --git a/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py b/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py
index bc61f0a0..4b615d55 100644
--- a/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py
+++ b/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py
@@ -6,6 +6,7 @@
 from ymmsl import (
         Operator, Reference, Checkpoints, CheckpointRangeRule, CheckpointAtRule)
 
+import libmuscle
 from libmuscle.logging import LogLevel
 from libmuscle.manager.mmp_server import MMPRequestHandler
 from libmuscle.mcp.protocol import RequestType, ResponseType
@@ -79,7 +80,8 @@ def test_register_instance(mmp_request_handler, instance_registry):
             RequestType.REGISTER_INSTANCE.value,
             'test_instance',
             ['tcp://localhost:10000'],
-            [['test_in', 'F_INIT']]]
+            [['test_in', 'F_INIT']],
+            libmuscle.__version__]
     encoded_request = msgpack.packb(request, use_bin_type=True)
 
     result = mmp_request_handler.handle_request(encoded_request)
@@ -94,6 +96,37 @@ def test_register_instance(mmp_request_handler, instance_registry):
     assert registered_ports['test_instance'][0].operator == Operator.F_INIT
 
 
+def test_register_instance_no_version(mmp_request_handler):
+    request = [
+            RequestType.REGISTER_INSTANCE.value,
+            'test_instance',
+            ['tcp://localhost:10000'],
+            [['test_in', 'F_INIT']]]
+    encoded_request = msgpack.packb(request, use_bin_type=True)
+
+    result = mmp_request_handler.handle_request(encoded_request)
+    decoded_result = msgpack.unpackb(result, raw=False)
+
+    assert decoded_result[0] == ResponseType.ERROR.value
+    assert 'version' in decoded_result[1]
+
+
+def test_register_instance_version_mismatch(mmp_request_handler):
+    request = [
+            RequestType.REGISTER_INSTANCE.value,
+            'test_instance',
+            ['tcp://localhost:10000'],
+            [['test_in', 'F_INIT']],
+            libmuscle.__version__ + "dev"]
+    encoded_request = msgpack.packb(request, use_bin_type=True)
+
+    result = mmp_request_handler.handle_request(encoded_request)
+    decoded_result = msgpack.unpackb(result, raw=False)
+
+    assert decoded_result[0] == ResponseType.ERROR.value
+    assert 'version' in decoded_result[1]
+
+
 def test_get_checkpoint_info(mmp_configuration, mmp_request_handler):
     resume_path = Path('/path/to/resume.pack')
     mmp_configuration.resume = {Reference('test_instance'): resume_path}
@@ -145,7 +178,8 @@ def test_double_register_instance(mmp_request_handler):
             RequestType.REGISTER_INSTANCE.value,
             'test_instance',
             ['tcp://localhost:10000'],
-            [['test_in', 'F_INIT']]]
+            [['test_in', 'F_INIT']],
+            libmuscle.__version__]
     encoded_request = msgpack.packb(request, use_bin_type=True)
 
     result = mmp_request_handler.handle_request(encoded_request)
diff --git a/libmuscle/python/libmuscle/mmp_client.py b/libmuscle/python/libmuscle/mmp_client.py
index eed4d99a..1deded1e 100644
--- a/libmuscle/python/libmuscle/mmp_client.py
+++ b/libmuscle/python/libmuscle/mmp_client.py
@@ -9,6 +9,7 @@
         Conduit, Operator, Port, Reference, Settings, Checkpoints,
         CheckpointRule, CheckpointRangeRule, CheckpointAtRule)
 
+import libmuscle
 from libmuscle.mcp.protocol import RequestType, ResponseType
 from libmuscle.mcp.tcp_transport_client import TcpTransportClient
 from libmuscle.profiling import ProfileEvent
@@ -192,7 +193,8 @@ def register_instance(self, name: Reference, locations: List[str],
         request = [
                 RequestType.REGISTER_INSTANCE.value,
                 str(name), locations,
-                [encode_port(p) for p in ports]]
+                [encode_port(p) for p in ports],
+                libmuscle.__version__]
         response = self._call_manager(request)
         if response[0] == ResponseType.ERROR.value:
             raise RuntimeError(
diff --git a/libmuscle/python/libmuscle/test/test_mmp_client.py b/libmuscle/python/libmuscle/test/test_mmp_client.py
index d5051962..51874e5d 100644
--- a/libmuscle/python/libmuscle/test/test_mmp_client.py
+++ b/libmuscle/python/libmuscle/test/test_mmp_client.py
@@ -4,6 +4,7 @@
 import pytest
 from ymmsl import Conduit, Operator, Port, Reference
 
+import libmuscle
 from libmuscle.logging import LogLevel, LogMessage, Timestamp
 from libmuscle.mcp.protocol import RequestType, ResponseType
 from libmuscle.mmp_client import MMPClient
@@ -85,7 +86,8 @@ def test_register_instance(mocked_mmp_client) -> None:
     sent_msg = msgpack.unpackb(stub.call.call_args[0][0], raw=False)
     assert sent_msg == [
             RequestType.REGISTER_INSTANCE.value, 'kernel[13]',
-            ['direct:test', 'tcp:test'], [['out', 'O_I'], ['in', 'S']]]
+            ['direct:test', 'tcp:test'], [['out', 'O_I'], ['in', 'S']],
+            libmuscle.__version__]
 
 
 def test_request_peers(mocked_mmp_client) -> None:

From f0553c38a3319adb1eef0dcd3ad29c0bb9f2a92d Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Fri, 13 Jan 2023 10:19:42 +0100
Subject: [PATCH 137/183] Improve Message construction syntax

Fixes #146, #125

- Python: add default (None) values for next_timestamp and data in
  Message.__init__
- C++: add Message(timestamp) constructor
- Fortran: add LIBMUSCLE_Message_create(timestamp)

Update Python examples to use `Message(t, data=...)` instead of
`Message(t, None, ...)`
---
 docs/source/examples/python/diffusion.py      |  2 +-
 .../examples/python/interact_coupling.py      |  4 ++--
 docs/source/examples/python/reaction.py       |  2 +-
 .../examples/python/reaction_diffusion.py     |  2 +-
 .../examples/python/reaction_diffusion_qmc.py |  6 ++---
 docs/source/fortran_api.rst                   |  7 ++++++
 docs/source/tutorial.rst                      |  9 ++++----
 docs/source/uncertainty_quantification.rst    |  4 ++--
 integration_test/test_all.py                  |  2 +-
 integration_test/test_duplication_mapper.py   |  2 +-
 integration_test/test_multicast.py            |  2 +-
 integration_test/test_parameter_overlays.py   |  5 ++--
 .../test_snapshot_complex_coupling.py         | 16 ++++++-------
 integration_test/test_snapshot_dispatch.py    | 12 +++++-----
 integration_test/test_snapshot_interact.py    |  4 ++--
 integration_test/test_snapshot_macro_micro.py | 16 ++++++-------
 .../cpp/build/libmuscle/libmuscle.version     |  1 +
 .../cpp/build/libmuscle/libmuscle_mpi.version |  1 +
 .../bindings/libmuscle_fortran_c.cpp          |  5 ++++
 .../bindings/libmuscle_mpi_fortran_c.cpp      |  5 ++++
 libmuscle/cpp/src/libmuscle/message.cpp       |  8 +++++++
 libmuscle/cpp/src/libmuscle/message.hpp       |  6 +++++
 .../tests/mocks/mock_communicator.cpp         |  4 ++--
 libmuscle/fortran/src/libmuscle/libmuscle.f90 | 23 +++++++++++++++++++
 .../fortran/src/libmuscle/libmuscle_mpi.f90   | 23 +++++++++++++++++++
 libmuscle/python/libmuscle/communicator.py    |  4 ++--
 .../python/libmuscle/test/test_snapshot.py    |  2 +-
 .../libmuscle/test/test_snapshot_manager.py   |  4 ++--
 scripts/make_libmuscle_api.py                 |  3 ++-
 29 files changed, 131 insertions(+), 53 deletions(-)

diff --git a/docs/source/examples/python/diffusion.py b/docs/source/examples/python/diffusion.py
index e4ad4726..994a5e1a 100644
--- a/docs/source/examples/python/diffusion.py
+++ b/docs/source/examples/python/diffusion.py
@@ -75,7 +75,7 @@ def diffusion() -> None:
             t_cur += dt
 
         # O_F
-        final_state_msg = Message(t_cur, None, Grid(U, ['x']))
+        final_state_msg = Message(t_cur, data=Grid(U, ['x']))
         instance.send('final_state_out', final_state_msg)
 
         if 'DONTPLOT' not in os.environ and 'SLURM_NODENAME' not in os.environ:
diff --git a/docs/source/examples/python/interact_coupling.py b/docs/source/examples/python/interact_coupling.py
index 3df5e11e..8014dad4 100644
--- a/docs/source/examples/python/interact_coupling.py
+++ b/docs/source/examples/python/interact_coupling.py
@@ -305,11 +305,11 @@ def checkpointing_temporal_coupler() -> None:
             t_cur = min(a.rcvd, b.rcvd)
             if instance.should_save_snapshot(t_cur):
                 instance.save_snapshot(Message(
-                        t_cur, None, {'a': a.get_state(), 'b': b.get_state()}))
+                        t_cur, data={'a': a.get_state(), 'b': b.get_state()}))
 
         t_cur = min(a.rcvd, b.rcvd)
         if instance.should_save_final_snapshot():
-            instance.save_final_snapshot(Message(t_cur, None, None))
+            instance.save_final_snapshot(Message(t_cur))
 
 
 if __name__ == '__main__':
diff --git a/docs/source/examples/python/reaction.py b/docs/source/examples/python/reaction.py
index aad03ba0..67e3d92e 100644
--- a/docs/source/examples/python/reaction.py
+++ b/docs/source/examples/python/reaction.py
@@ -30,7 +30,7 @@ def reaction() -> None:
             t_cur += dt
 
         # O_F
-        instance.send('final_state', Message(t_cur, None, Grid(U, ['x'])))
+        instance.send('final_state', Message(t_cur, data=Grid(U, ['x'])))
 
 
 if __name__ == '__main__':
diff --git a/docs/source/examples/python/reaction_diffusion.py b/docs/source/examples/python/reaction_diffusion.py
index 06001106..75958d5f 100644
--- a/docs/source/examples/python/reaction_diffusion.py
+++ b/docs/source/examples/python/reaction_diffusion.py
@@ -34,7 +34,7 @@ def reaction() -> None:
             t_cur += dt
 
         # O_F
-        instance.send('final_state', Message(t_cur, None, Grid(U, ['x'])))
+        instance.send('final_state', Message(t_cur, data=Grid(U, ['x'])))
 
 
 def laplacian(Z: np.array, dx: float) -> np.array:
diff --git a/docs/source/examples/python/reaction_diffusion_qmc.py b/docs/source/examples/python/reaction_diffusion_qmc.py
index 1f203de1..f96de4b1 100644
--- a/docs/source/examples/python/reaction_diffusion_qmc.py
+++ b/docs/source/examples/python/reaction_diffusion_qmc.py
@@ -35,7 +35,7 @@ def reaction() -> None:
             t_cur += dt
 
         # O_F
-        instance.send('final_state', Message(t_cur, None, Grid(U, ['x'])))
+        instance.send('final_state', Message(t_cur, data=Grid(U, ['x'])))
 
 
 def laplacian(Z: np.array, dx: float) -> np.array:
@@ -105,7 +105,7 @@ def diffusion() -> None:
             t_cur += dt
 
         # O_F
-        instance.send('final_state_out', Message(t_cur, None, Grid(U, ['x'])))
+        instance.send('final_state_out', Message(t_cur, data=Grid(U, ['x'])))
 
 
 def load_balancer() -> None:
@@ -200,7 +200,7 @@ def qmc_driver() -> None:
             uq_parameters = Settings({
                 'd': ds[sample],
                 'k': ks[sample]})
-            msg = Message(0.0, None, uq_parameters)
+            msg = Message(0.0, data=uq_parameters)
             instance.send('parameters_out', msg, sample)
 
         # S
diff --git a/docs/source/fortran_api.rst b/docs/source/fortran_api.rst
index 0d981576..895a49bd 100644
--- a/docs/source/fortran_api.rst
+++ b/docs/source/fortran_api.rst
@@ -1257,6 +1257,13 @@ LIBMUSCLE_Message
     will be overlaid onto the receiving model's settings; this is normally only
     used by special simulation components.
 
+.. f:function:: LIBMUSCLE_Message_create(timestamp)
+
+    Create a new Message object.
+
+    :p LIBMUSCLE_real8 timestamp: The simulated time to which the data in this
+            message applies.
+
 .. f:function:: LIBMUSCLE_Message_create(timestamp, data)
 
     Create a new Message object.
diff --git a/docs/source/tutorial.rst b/docs/source/tutorial.rst
index ab260230..3e10895b 100644
--- a/docs/source/tutorial.rst
+++ b/docs/source/tutorial.rst
@@ -228,7 +228,7 @@ Sending the final result
 .. code-block:: python
 
   # O_F
-  instance.send('final_state', Message(t_cur, None, Grid(U, ['x'])))
+  instance.send('final_state', Message(t_cur, data=Grid(U, ['x'])))
 
 
 After the update loop is done, the model has arrived at its final state. We
@@ -240,9 +240,7 @@ Execution Loop, so that is where we declared this port to live in our
 
 To send a message, we specify the port on which to send (which must match the
 declaration by name and operator), and a Message object containing the current
-simulation time and the current state, converted to a Grid. The optional second
-parameter is a second timestamp, which will be discussed below, and is set to
-``None`` here.
+simulation time and the current state, converted to a Grid.
 
 MUSCLE3 uses `MessagePack <https://msgpack.org>`_ to encode messages between
 models. MessagePack is a binary encoding format which can be thought of as a
@@ -334,7 +332,8 @@ So, to make your submodel more generically usable, it's good to set the second
 timestamp. But perhaps you're trying to connect an existing codebase that uses
 varying timestep sizes, and it's not easy to get it to tell you how big the
 next timestep will be. In that case, if you're not doing time scale overlap,
-just put ``None`` there and move on to the next problem, it'll work just fine.
+just create your message via ``Message(timestamp, data=...)`` or put ``None``
+as ``next_timestamp`` and move on to the next problem, it'll work just fine.
 
 Receiving messages with a default
 ---------------------------------
diff --git a/docs/source/uncertainty_quantification.rst b/docs/source/uncertainty_quantification.rst
index 1a25a98b..36dff972 100644
--- a/docs/source/uncertainty_quantification.rst
+++ b/docs/source/uncertainty_quantification.rst
@@ -174,7 +174,7 @@ this case, the port will be resizable and it will work as intended.
       uq_parameters = Settings({
           'd': ds[sample],
           'k': ks[sample]})
-      msg = Message(0.0, None, uq_parameters)
+      msg = Message(0.0, data=uq_parameters)
       instance.send('parameters_out', msg, sample)
 
 Since we only run our O_I and S once, we do not have a state update loop that
@@ -189,7 +189,7 @@ in the central configuration.
 
 Next, we create a :class:`libmuscle.Message` object to send. Since our models
 will start at time 0, we'll set that as the timestamp, and since we're only
-running them once each, the next timestamp is ``None``. For the data, we send
+running them once each, we omit the next timestamp. For the data, we send
 the ``Settings`` object. (MUSCLE3 contains special support for sending
 ``Settings`` objects, since being objects they're not normally
 MessagePack-serialisable.)
diff --git a/integration_test/test_all.py b/integration_test/test_all.py
index 8764f3a4..a9c578e0 100644
--- a/integration_test/test_all.py
+++ b/integration_test/test_all.py
@@ -56,7 +56,7 @@ def micro():
                 'int': 42,
                 'float': 3.1416,
                 'grid': Grid(np.array([[12.0, 34.0, 56.0], [1.0, 2.0, 3.0]]))}
-        instance.send('out', Message(0.1, None, result))
+        instance.send('out', Message(0.1, data=result))
 
 
 def test_all(log_file_in_tmpdir):
diff --git a/integration_test/test_duplication_mapper.py b/integration_test/test_duplication_mapper.py
index 9717ad36..7f65d6ad 100644
--- a/integration_test/test_duplication_mapper.py
+++ b/integration_test/test_duplication_mapper.py
@@ -14,7 +14,7 @@ def duplication_mapper():
         # o_f
         out_ports = instance.list_ports()[Operator.O_F]
 
-        message = Message(0.0, None, 'testing')
+        message = Message(0.0, data='testing')
         for out_port in out_ports:
             instance.send(out_port, message)
 
diff --git a/integration_test/test_multicast.py b/integration_test/test_multicast.py
index 8dedee17..587f722e 100644
--- a/integration_test/test_multicast.py
+++ b/integration_test/test_multicast.py
@@ -10,7 +10,7 @@ def multicaster():
 
     while instance.reuse_instance():
         # o_f
-        message = Message(0.0, None, 'testing')
+        message = Message(0.0, data='testing')
         instance.send('out', message)
 
 
diff --git a/integration_test/test_parameter_overlays.py b/integration_test/test_parameter_overlays.py
index cf091193..1e342e5d 100644
--- a/integration_test/test_parameter_overlays.py
+++ b/integration_test/test_parameter_overlays.py
@@ -22,8 +22,7 @@ def qmc():
         length = instance.get_port_length('settings_out')
         assert length == 10
         for slot in range(length):
-            instance.send('settings_out',
-                          Message(0.0, None, settings0), slot)
+            instance.send('settings_out', Message(0.0, data=settings0), slot)
 
 
 def macro():
@@ -87,7 +86,7 @@ def micro():
         #     instance.receive_with_settings('in')
 
         # o_f
-        instance.send('out', Message(0.1, None, 'testing back'))
+        instance.send('out', Message(0.1, data='testing back'))
 
 
 def test_settings_overlays(log_file_in_tmpdir):
diff --git a/integration_test/test_snapshot_complex_coupling.py b/integration_test/test_snapshot_complex_coupling.py
index dad2ee34..ba612075 100644
--- a/integration_test/test_snapshot_complex_coupling.py
+++ b/integration_test/test_snapshot_complex_coupling.py
@@ -23,7 +23,7 @@ def cache_component(max_channels=2):
     cache_t = float('-inf')
     cache_data = []
     max_cache_age = None
-    nil_msg = Message(0.0, None, None)
+    nil_msg = Message(0.0)
 
     while instance.reuse_instance():
         if instance.resuming():
@@ -41,17 +41,17 @@ def cache_component(max_channels=2):
         if cur_t - cache_t >= max_cache_age:
             # Cached value is no longer valid, run submodel for updated data
             for msg, port in zip(msgs, ports[Operator.O_I]):
-                instance.send(port, Message(cur_t, None, msg.data))
+                instance.send(port, Message(cur_t, data=msg.data))
             cache_data = [instance.receive(port, default=nil_msg).data
                           for port in ports[Operator.S]]
             cache_t = cur_t
             max_cache_age = random.uniform(*cache_valid_range)
 
         for data, port in zip(cache_data, ports[Operator.O_F]):
-            instance.send(port, Message(cur_t, None, data))
+            instance.send(port, Message(cur_t, data=data))
 
         if instance.should_save_final_snapshot():
-            instance.save_final_snapshot(Message(cur_t, None, []))
+            instance.save_final_snapshot(Message(cur_t, data=[]))
 
 
 def echo_component(max_channels=2):
@@ -87,7 +87,7 @@ def main_component():
             i = 0
 
         while time.monotonic() < monotonic_end:
-            instance.send('state_out', Message(t_cur, None, i))
+            instance.send('state_out', Message(t_cur, data=i))
             for port in ('Ai', 'Bi', 'Ci', 'Di'):
                 instance.receive(port)
 
@@ -97,12 +97,12 @@ def main_component():
 
             if instance.should_save_snapshot(t_cur):
                 instance.save_snapshot(Message(
-                        t_cur, None, [i, monotonic_end - time.monotonic()]))
+                        t_cur, data=[i, monotonic_end - time.monotonic()]))
 
-        instance.send('o_f', Message(t_cur, None, i))
+        instance.send('o_f', Message(t_cur, data=i))
 
         if instance.should_save_final_snapshot():
-            instance.save_final_snapshot(Message(t_cur, None, [i, 0]))
+            instance.save_final_snapshot(Message(t_cur, data=[i, 0]))
 
 
 @pytest.fixture
diff --git a/integration_test/test_snapshot_dispatch.py b/integration_test/test_snapshot_dispatch.py
index 7102a43c..dc1b1aec 100644
--- a/integration_test/test_snapshot_dispatch.py
+++ b/integration_test/test_snapshot_dispatch.py
@@ -25,7 +25,7 @@ def component():
             i, t_stop = msg.data
 
         if instance.should_init():
-            msg = instance.receive('f_i', default=Message(0, None, 0))
+            msg = instance.receive('f_i', default=Message(0, data=0))
             t_cur = msg.timestamp
             i = msg.data
             t_stop = t_cur + t_max
@@ -35,12 +35,12 @@ def component():
             t_cur += dt
 
             if instance.should_save_snapshot(t_cur):
-                instance.save_snapshot(Message(t_cur, None, [i, t_stop]))
+                instance.save_snapshot(Message(t_cur, data=[i, t_stop]))
 
-        instance.send('o_f', Message(t_cur, None, i))
+        instance.send('o_f', Message(t_cur, data=i))
 
         if instance.should_save_final_snapshot():
-            instance.save_final_snapshot(Message(t_cur, None, [i, t_stop]))
+            instance.save_final_snapshot(Message(t_cur, data=[i, t_stop]))
 
 
 def stateless_component():
@@ -53,7 +53,7 @@ def stateless_component():
         dt = instance.get_setting('dt', 'float')
         t_max = instance.get_setting('t_max', 'float')
 
-        msg = instance.receive('f_i', default=Message(0, None, 0))
+        msg = instance.receive('f_i', default=Message(0, data=0))
         t_cur = msg.timestamp
         i = msg.data
         t_stop = t_cur + t_max
@@ -62,7 +62,7 @@ def stateless_component():
             # faux time-integration for testing snapshots
             t_cur += dt
 
-        instance.send('o_f', Message(t_cur, None, i))
+        instance.send('o_f', Message(t_cur, data=i))
 
 
 @pytest.fixture
diff --git a/integration_test/test_snapshot_interact.py b/integration_test/test_snapshot_interact.py
index 5492f9e2..4cb32b9a 100644
--- a/integration_test/test_snapshot_interact.py
+++ b/integration_test/test_snapshot_interact.py
@@ -58,10 +58,10 @@ def component():
             i += 1
 
             if instance.should_save_snapshot(t_cur):
-                instance.save_snapshot(Message(t_cur, None, [i, t_stop]))
+                instance.save_snapshot(Message(t_cur, data=[i, t_stop]))
 
         if instance.should_save_final_snapshot():
-            instance.save_final_snapshot(Message(t_cur, None, [i, t_stop]))
+            instance.save_final_snapshot(Message(t_cur, data=[i, t_stop]))
 
 
 def test_snapshot_interact_lockstep(tmp_path):
diff --git a/integration_test/test_snapshot_macro_micro.py b/integration_test/test_snapshot_macro_micro.py
index 885ac704..ee2f0011 100644
--- a/integration_test/test_snapshot_macro_micro.py
+++ b/integration_test/test_snapshot_macro_micro.py
@@ -43,10 +43,10 @@ def macro():
             t_cur += dt
 
             if instance.should_save_snapshot(t_cur):
-                instance.save_snapshot(Message(t_cur, None, i))
+                instance.save_snapshot(Message(t_cur, data=i))
 
         if instance.should_save_final_snapshot():
-            instance.save_final_snapshot(Message(t_cur, None, i))
+            instance.save_final_snapshot(Message(t_cur, data=i))
 
 
 def macro_vector():
@@ -84,10 +84,10 @@ def macro_vector():
             t_cur += dt
 
             if instance.should_save_snapshot(t_cur):
-                instance.save_snapshot(Message(t_cur, None, i))
+                instance.save_snapshot(Message(t_cur, data=i))
 
         if instance.should_save_final_snapshot():
-            instance.save_final_snapshot(Message(t_cur, None, i))
+            instance.save_final_snapshot(Message(t_cur, data=i))
 
 
 def micro():
@@ -115,12 +115,12 @@ def micro():
             t_cur += dt
 
             if instance.should_save_snapshot(t_cur):
-                instance.save_snapshot(Message(t_cur, None, [i, t_stop]))
+                instance.save_snapshot(Message(t_cur, data=[i, t_stop]))
 
-        instance.send('o_f', Message(t_cur, None, i))
+        instance.send('o_f', Message(t_cur, data=i))
 
         if instance.should_save_final_snapshot():
-            instance.save_final_snapshot(Message(t_cur, None, [i, t_stop]))
+            instance.save_final_snapshot(Message(t_cur, data=[i, t_stop]))
 
 
 def stateless_micro():
@@ -142,7 +142,7 @@ def stateless_micro():
             # faux time-integration for testing snapshots
             t_cur += dt
 
-        instance.send('o_f', Message(t_cur, None, i))
+        instance.send('o_f', Message(t_cur, data=i))
 
 
 def data_transformer():
diff --git a/libmuscle/cpp/build/libmuscle/libmuscle.version b/libmuscle/cpp/build/libmuscle/libmuscle.version
index 4dc0e9b0..6a5400c4 100644
--- a/libmuscle/cpp/build/libmuscle/libmuscle.version
+++ b/libmuscle/cpp/build/libmuscle/libmuscle.version
@@ -303,6 +303,7 @@
         LIBMUSCLE_PortsDescription_add_;
         LIBMUSCLE_PortsDescription_num_ports_;
         LIBMUSCLE_PortsDescription_get_;
+        LIBMUSCLE_Message_create_t_;
         LIBMUSCLE_Message_create_td_;
         LIBMUSCLE_Message_create_tnd_;
         LIBMUSCLE_Message_create_tds_;
diff --git a/libmuscle/cpp/build/libmuscle/libmuscle_mpi.version b/libmuscle/cpp/build/libmuscle/libmuscle_mpi.version
index dac26ff1..a6cb4915 100644
--- a/libmuscle/cpp/build/libmuscle/libmuscle_mpi.version
+++ b/libmuscle/cpp/build/libmuscle/libmuscle_mpi.version
@@ -303,6 +303,7 @@
         LIBMUSCLE_PortsDescription_add_;
         LIBMUSCLE_PortsDescription_num_ports_;
         LIBMUSCLE_PortsDescription_get_;
+        LIBMUSCLE_Message_create_t_;
         LIBMUSCLE_Message_create_td_;
         LIBMUSCLE_Message_create_tnd_;
         LIBMUSCLE_Message_create_tds_;
diff --git a/libmuscle/cpp/src/libmuscle/bindings/libmuscle_fortran_c.cpp b/libmuscle/cpp/src/libmuscle/bindings/libmuscle_fortran_c.cpp
index 418a9c89..d20562d5 100644
--- a/libmuscle/cpp/src/libmuscle/bindings/libmuscle_fortran_c.cpp
+++ b/libmuscle/cpp/src/libmuscle/bindings/libmuscle_fortran_c.cpp
@@ -3549,6 +3549,11 @@ void LIBMUSCLE_PortsDescription_get_(std::intptr_t self, int op, std::size_t i,
     }
 }
 
+std::intptr_t LIBMUSCLE_Message_create_t_(double timestamp) {
+    Message * result = new Message(timestamp);
+    return reinterpret_cast<std::intptr_t>(result);
+}
+
 std::intptr_t LIBMUSCLE_Message_create_td_(double timestamp, std::intptr_t data) {
     Data * data_p = reinterpret_cast<Data *>(data);
     Message * result = new Message(timestamp, *data_p);
diff --git a/libmuscle/cpp/src/libmuscle/bindings/libmuscle_mpi_fortran_c.cpp b/libmuscle/cpp/src/libmuscle/bindings/libmuscle_mpi_fortran_c.cpp
index 877bb2a6..fb1ba471 100644
--- a/libmuscle/cpp/src/libmuscle/bindings/libmuscle_mpi_fortran_c.cpp
+++ b/libmuscle/cpp/src/libmuscle/bindings/libmuscle_mpi_fortran_c.cpp
@@ -3549,6 +3549,11 @@ void LIBMUSCLE_PortsDescription_get_(std::intptr_t self, int op, std::size_t i,
     }
 }
 
+std::intptr_t LIBMUSCLE_Message_create_t_(double timestamp) {
+    Message * result = new Message(timestamp);
+    return reinterpret_cast<std::intptr_t>(result);
+}
+
 std::intptr_t LIBMUSCLE_Message_create_td_(double timestamp, std::intptr_t data) {
     Data * data_p = reinterpret_cast<Data *>(data);
     Message * result = new Message(timestamp, *data_p);
diff --git a/libmuscle/cpp/src/libmuscle/message.cpp b/libmuscle/cpp/src/libmuscle/message.cpp
index 0809efd3..e5a648f9 100644
--- a/libmuscle/cpp/src/libmuscle/message.cpp
+++ b/libmuscle/cpp/src/libmuscle/message.cpp
@@ -6,6 +6,14 @@ using ymmsl::Settings;
 
 namespace libmuscle { namespace impl {
 
+Message::Message(
+        double timestamp)
+    : timestamp_(timestamp)
+    , next_timestamp_()
+    , data_()
+    , settings_()
+{}
+
 Message::Message(
         double timestamp,
         DataConstRef const & data)
diff --git a/libmuscle/cpp/src/libmuscle/message.hpp b/libmuscle/cpp/src/libmuscle/message.hpp
index d50b1ad8..ab2ec186 100644
--- a/libmuscle/cpp/src/libmuscle/message.hpp
+++ b/libmuscle/cpp/src/libmuscle/message.hpp
@@ -17,6 +17,12 @@ namespace libmuscle { namespace impl {
 // out on the wire. See libmuscle::mcp::Message for that.
 class Message {
     public:
+        /** Create an empty Message.
+         *
+         * @param timestamp Simulation time for which this data is valid.
+         */
+        Message(double timestamp);
+
         /** Create a Message.
          *
          * @param timestamp Simulation time for which this data is valid.
diff --git a/libmuscle/cpp/src/libmuscle/tests/mocks/mock_communicator.cpp b/libmuscle/cpp/src/libmuscle/tests/mocks/mock_communicator.cpp
index 3a923511..0f01a3a8 100644
--- a/libmuscle/cpp/src/libmuscle/tests/mocks/mock_communicator.cpp
+++ b/libmuscle/cpp/src/libmuscle/tests/mocks/mock_communicator.cpp
@@ -107,7 +107,7 @@ void MockCommunicator::reset() {
     next_received_message.clear();
     list_ports_return_value.clear();
     last_sent_port = "";
-    last_sent_message = Message(0.0, Data());
+    last_sent_message = Message(0.0);
     last_sent_slot = {};
 }
 
@@ -126,7 +126,7 @@ PortsDescription MockCommunicator::list_ports_return_value;
 
 std::string MockCommunicator::last_sent_port;
 
-Message MockCommunicator::last_sent_message(0.0, Data());
+Message MockCommunicator::last_sent_message(0.0);
 
 Optional<int> MockCommunicator::last_sent_slot;
 
diff --git a/libmuscle/fortran/src/libmuscle/libmuscle.f90 b/libmuscle/fortran/src/libmuscle/libmuscle.f90
index edbf529d..7c19c494 100644
--- a/libmuscle/fortran/src/libmuscle/libmuscle.f90
+++ b/libmuscle/fortran/src/libmuscle/libmuscle.f90
@@ -402,6 +402,7 @@ module libmuscle
     end type LIBMUSCLE_Message
     public :: LIBMUSCLE_Message
 
+    public :: LIBMUSCLE_Message_create_t
     public :: LIBMUSCLE_Message_create_td
     public :: LIBMUSCLE_Message_create_tnd
     public :: LIBMUSCLE_Message_create_tds
@@ -2784,6 +2785,13 @@ subroutine LIBMUSCLE_PortsDescription_get_( &
             integer (c_size_t), intent(out) :: err_msg_len
         end subroutine LIBMUSCLE_PortsDescription_get_
 
+        integer (c_intptr_t) function LIBMUSCLE_Message_create_t_(timestamp) &
+                bind(C, name="LIBMUSCLE_Message_create_t_")
+
+            use iso_c_binding
+            real (c_double), value, intent(in) :: timestamp
+        end function LIBMUSCLE_Message_create_t_
+
         integer (c_intptr_t) function LIBMUSCLE_Message_create_td_( &
                 timestamp, &
                 data) &
@@ -3798,6 +3806,7 @@ end function LIBMUSCLE_Instance_receive_with_settings_psd_
 
     interface LIBMUSCLE_Message_create
         module procedure &
+            LIBMUSCLE_Message_create_t, &
             LIBMUSCLE_Message_create_td, &
             LIBMUSCLE_Message_create_tnd, &
             LIBMUSCLE_Message_create_tds, &
@@ -16172,6 +16181,20 @@ function LIBMUSCLE_PortsDescription_get( &
         end do
     end function LIBMUSCLE_PortsDescription_get
 
+    function LIBMUSCLE_Message_create_t( &
+            timestamp)
+        implicit none
+        real (LIBMUSCLE_real8), intent(in) :: timestamp
+        type(LIBMUSCLE_Message) :: LIBMUSCLE_Message_create_t
+
+        integer (c_intptr_t) :: ret_val
+
+        ret_val = LIBMUSCLE_Message_create_t_( &
+            timestamp)
+
+        LIBMUSCLE_Message_create_t%ptr = ret_val
+    end function LIBMUSCLE_Message_create_t
+
     function LIBMUSCLE_Message_create_td( &
             timestamp, &
             data)
diff --git a/libmuscle/fortran/src/libmuscle/libmuscle_mpi.f90 b/libmuscle/fortran/src/libmuscle/libmuscle_mpi.f90
index 5f56e3ab..76a9940f 100644
--- a/libmuscle/fortran/src/libmuscle/libmuscle_mpi.f90
+++ b/libmuscle/fortran/src/libmuscle/libmuscle_mpi.f90
@@ -403,6 +403,7 @@ module libmuscle_mpi
     end type LIBMUSCLE_Message
     public :: LIBMUSCLE_Message
 
+    public :: LIBMUSCLE_Message_create_t
     public :: LIBMUSCLE_Message_create_td
     public :: LIBMUSCLE_Message_create_tnd
     public :: LIBMUSCLE_Message_create_tds
@@ -2789,6 +2790,13 @@ subroutine LIBMUSCLE_PortsDescription_get_( &
             integer (c_size_t), intent(out) :: err_msg_len
         end subroutine LIBMUSCLE_PortsDescription_get_
 
+        integer (c_intptr_t) function LIBMUSCLE_Message_create_t_(timestamp) &
+                bind(C, name="LIBMUSCLE_Message_create_t_")
+
+            use iso_c_binding
+            real (c_double), value, intent(in) :: timestamp
+        end function LIBMUSCLE_Message_create_t_
+
         integer (c_intptr_t) function LIBMUSCLE_Message_create_td_( &
                 timestamp, &
                 data) &
@@ -3812,6 +3820,7 @@ end function LIBMUSCLE_Instance_receive_with_settings_psd_
 
     interface LIBMUSCLE_Message_create
         module procedure &
+            LIBMUSCLE_Message_create_t, &
             LIBMUSCLE_Message_create_td, &
             LIBMUSCLE_Message_create_tnd, &
             LIBMUSCLE_Message_create_tds, &
@@ -16190,6 +16199,20 @@ function LIBMUSCLE_PortsDescription_get( &
         end do
     end function LIBMUSCLE_PortsDescription_get
 
+    function LIBMUSCLE_Message_create_t( &
+            timestamp)
+        implicit none
+        real (LIBMUSCLE_real8), intent(in) :: timestamp
+        type(LIBMUSCLE_Message) :: LIBMUSCLE_Message_create_t
+
+        integer (c_intptr_t) :: ret_val
+
+        ret_val = LIBMUSCLE_Message_create_t_( &
+            timestamp)
+
+        LIBMUSCLE_Message_create_t%ptr = ret_val
+    end function LIBMUSCLE_Message_create_t
+
     function LIBMUSCLE_Message_create_td( &
             timestamp, &
             data)
diff --git a/libmuscle/python/libmuscle/communicator.py b/libmuscle/python/libmuscle/communicator.py
index 69272f78..e8ea49d2 100644
--- a/libmuscle/python/libmuscle/communicator.py
+++ b/libmuscle/python/libmuscle/communicator.py
@@ -36,8 +36,8 @@ class Message:
     """
     # Note: This is for communication with the user, it's not what
     # actually goes out on the wire, see libmuscle.mcp.Message for that.
-    def __init__(self, timestamp: float, next_timestamp: Optional[float],
-                 data: MessageObject,
+    def __init__(self, timestamp: float, next_timestamp: Optional[float] = None,
+                 data: MessageObject = None,
                  settings: Optional[Settings] = None
                  ) -> None:
         """Create a Message.
diff --git a/libmuscle/python/libmuscle/test/test_snapshot.py b/libmuscle/python/libmuscle/test/test_snapshot.py
index f459a001..fd84d540 100644
--- a/libmuscle/python/libmuscle/test/test_snapshot.py
+++ b/libmuscle/python/libmuscle/test/test_snapshot.py
@@ -11,7 +11,7 @@ def snapshot() -> Snapshot:
     wallclock_time = 15.3
     port_message_counts = {'in': [1], 'out': [4], 'muscle_settings_in': [0]}
     is_final = True
-    message = Message(1.2, None, 'test_data')
+    message = Message(1.2, data='test_data')
     snapshot = MsgPackSnapshot(
             triggers, wallclock_time, port_message_counts, is_final, message)
     assert snapshot.triggers == triggers
diff --git a/libmuscle/python/libmuscle/test/test_snapshot_manager.py b/libmuscle/python/libmuscle/test/test_snapshot_manager.py
index 31423bb0..c53d5f8e 100644
--- a/libmuscle/python/libmuscle/test/test_snapshot_manager.py
+++ b/libmuscle/python/libmuscle/test/test_snapshot_manager.py
@@ -33,7 +33,7 @@ def test_save_load_snapshot(tmp_path: Path) -> None:
     assert not snapshot_manager.resuming_from_final()
 
     snapshot_manager.save_snapshot(
-            Message(0.2, None, 'test data'), False, ['test'], 13.0)
+            Message(0.2, data='test data'), False, ['test'], 13.0)
 
     communicator.get_message_counts.assert_called_with()
     manager.submit_snapshot_metadata.assert_called()
@@ -63,7 +63,7 @@ def test_save_load_snapshot(tmp_path: Path) -> None:
     assert msg.data == 'test data'
 
     snapshot_manager2.save_snapshot(
-            Message(0.6, None, 'test data2'), True, ['test'], 42.2, 1.2)
+            Message(0.6, data='test data2'), True, ['test'], 42.2, 1.2)
 
     instance, metadata = manager.submit_snapshot_metadata.call_args[0]
     assert instance == instance_id
diff --git a/scripts/make_libmuscle_api.py b/scripts/make_libmuscle_api.py
index 0b083a2e..a096b262 100755
--- a/scripts/make_libmuscle_api.py
+++ b/scripts/make_libmuscle_api.py
@@ -755,6 +755,7 @@ def __copy__(self) -> 'Elements':
 
 
 message_desc = Class('Message', None, [
+    Constructor([Double('timestamp')], 'create_t'),
     Constructor([Double('timestamp'), Obj('Data', 'data')], 'create_td'),
     Constructor(
         [Double('timestamp'), Double('next_timestamp'), Obj('Data', 'data')],
@@ -767,7 +768,7 @@ def __copy__(self) -> 'Elements':
             Obj('Settings', 'settings')],
         'create_tnds'),
     OverloadSet('create', [
-        'create_td', 'create_tnd', 'create_tds', 'create_tnds']),
+        'create_t', 'create_td', 'create_tnd', 'create_tds', 'create_tnds']),
     Destructor(),
     MemFun(Double(), 'timestamp'),
     MemFun(Void(), 'set_timestamp', [Double('timestamp')]),

From 6135b9d42a52e278a42834ae34bdfee116d1ed11 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Fri, 13 Jan 2023 10:40:11 +0100
Subject: [PATCH 138/183] Replace yatiml references by MUSCLE3 references

Fixes #138
---
 CONTRIBUTING.rst | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst
index b14102e9..2014113c 100644
--- a/CONTRIBUTING.rst
+++ b/CONTRIBUTING.rst
@@ -15,14 +15,14 @@ The sections below outline the steps in each case.
 You have a question
 *******************
 
-#. use the search functionality `here <https://github.com/yatiml/yatiml/issues>`_ to see if someone already filed the same issue;
+#. use the search functionality `here <https://github.com/multiscale/muscle3/issues>`_ to see if someone already filed the same issue;
 #. if your issue search did not yield any relevant results, make a new issue;
 #. apply the "Question" label; apply other labels when relevant.
 
 You think you may have found a bug
 **********************************
 
-#. use the search functionality `here <https://github.com/yatiml/yatiml/issues>`_ to see if someone already filed the same issue;
+#. use the search functionality `here <https://github.com/multiscale/muscle3/issues>`_ to see if someone already filed the same issue;
 #. if your issue search did not yield any relevant results, make a new issue, making sure to provide enough information to the rest of the community to understand the cause and context of the problem. Depending on the issue, you may want to include:
     - the `SHA hashcode <https://help.github.com/articles/autolinked-references-and-urls/#commit-shas>`_ of the commit that is causing your problem;
     - some identifying information (name and version number) for dependencies you're using;
@@ -35,10 +35,10 @@ You want to make some kind of change to the code base
 #. (**important**) announce your plan to the rest of the community *before you start working*. This announcement should be in the form of a (new) issue;
 #. (**important**) wait until some kind of consensus is reached about your idea being a good idea;
 #. if needed, fork the repository to your own Github profile and create your own feature branch off of the latest master commit. While working on your feature branch, make sure to stay up to date with the master branch by pulling in changes, possibly from the 'upstream' repository (follow the instructions `here <https://help.github.com/articles/configuring-a-remote-for-a-fork/>`_ and `here <https://help.github.com/articles/syncing-a-fork/>`_);
-#. make sure the existing tests still work by running ``python setup.py test``;
+#. make sure the existing tests still work by running ``make test`` and ``make test_examples``;
 #. add your own tests (if necessary);
 #. update or expand the documentation;
-#. `push <http://rogerdudler.github.io/git-guide/>`_ your feature branch to (your fork of) the YAtiML repository on GitHub;
+#. `push <http://rogerdudler.github.io/git-guide/>`_ your feature branch to (your fork of) the MUSCLE3 repository on GitHub;
 #. create the pull request, e.g. following the instructions `here <https://help.github.com/articles/creating-a-pull-request/>`_.
 
 In case you feel like you've made a valuable contribution, but you don't know how to write or run tests for it, or how to generate the documentation: don't let this discourage you from making the pull request; we can help you! Just go ahead and submit the pull request, but keep in mind that you might be asked to append additional commits to your pull request.

From 7fdebc2f9f8abdc940b1cbf425399214226682b3 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Fri, 13 Jan 2023 11:30:44 +0100
Subject: [PATCH 139/183] Mark Message(timestamp) constructor as explicit

---
 libmuscle/cpp/src/libmuscle/message.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libmuscle/cpp/src/libmuscle/message.hpp b/libmuscle/cpp/src/libmuscle/message.hpp
index ab2ec186..c73ad01e 100644
--- a/libmuscle/cpp/src/libmuscle/message.hpp
+++ b/libmuscle/cpp/src/libmuscle/message.hpp
@@ -21,7 +21,7 @@ class Message {
          *
          * @param timestamp Simulation time for which this data is valid.
          */
-        Message(double timestamp);
+        explicit Message(double timestamp);
 
         /** Create a Message.
          *

From f5fc06873db07f7e56994848dc8e19fdb32dafab Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Mon, 5 Dec 2022 11:34:58 +0100
Subject: [PATCH 140/183] Add Python checkpointing examples

---
 docs/source/examples/Makefile                 |   2 +
 docs/source/examples/python/Makefile          |   1 +
 .../python/checkpointing_diffusion.py         | 119 ++++++++++++++++++
 .../examples/python/checkpointing_reaction.py |  57 +++++++++
 docs/source/examples/rd_checkpoints.ymmsl     |  31 +++++
 .../examples/rd_implementations.ymmsl.in      |  10 ++
 6 files changed, 220 insertions(+)
 create mode 100644 docs/source/examples/python/checkpointing_diffusion.py
 create mode 100644 docs/source/examples/python/checkpointing_reaction.py
 create mode 100644 docs/source/examples/rd_checkpoints.ymmsl

diff --git a/docs/source/examples/Makefile b/docs/source/examples/Makefile
index 2c70fe8e..8507eb52 100644
--- a/docs/source/examples/Makefile
+++ b/docs/source/examples/Makefile
@@ -101,6 +101,7 @@ clean:
 	$(MAKE) -C fortran clean
 	$(MAKE) -C python clean
 	rm -f rd_implementations.ymmsl
+	rm -rf run_*/
 
 
 # Tests
@@ -108,6 +109,7 @@ clean:
 .PHONY: test_python
 test_python: base
 	. python/build/venv/bin/activate && DONTPLOT=1 muscle_manager --start-all rd_implementations.ymmsl rd_python.ymmsl rd_settings.ymmsl
+	. python/build/venv/bin/activate && DONTPLOT=1 muscle_manager --start-all rd_implementations.ymmsl rd_checkpoints.ymmsl rd_settings.ymmsl
 	make -C python test
 
 .PHONY: test_cpp
diff --git a/docs/source/examples/python/Makefile b/docs/source/examples/python/Makefile
index f87b4616..48f27607 100644
--- a/docs/source/examples/python/Makefile
+++ b/docs/source/examples/python/Makefile
@@ -11,3 +11,4 @@ test:
 .PHONY: clean
 clean:
 	$(MAKE) -C build clean
+	rm -f *.log
diff --git a/docs/source/examples/python/checkpointing_diffusion.py b/docs/source/examples/python/checkpointing_diffusion.py
new file mode 100644
index 00000000..067858ed
--- /dev/null
+++ b/docs/source/examples/python/checkpointing_diffusion.py
@@ -0,0 +1,119 @@
+import logging
+import os
+
+import numpy as np
+
+from libmuscle import Grid, Instance, Message
+from ymmsl import Operator
+
+
+def laplacian(Z: np.ndarray, dx: float) -> np.ndarray:
+    """Calculates the Laplacian of vector Z.
+
+    Args:
+        Z: A vector representing a series of samples along a line.
+        dx: The spacing between the samples.
+
+    Returns:
+        The second spatial derivative of Z.
+    """
+    Zleft = Z[:-2]
+    Zright = Z[2:]
+    Zcenter = Z[1:-1]
+    return (Zleft + Zright - 2. * Zcenter) / dx**2
+
+
+def diffusion() -> None:
+    """A simple diffusion model on a 1d grid.
+
+    The state of this model is a 1D grid of concentrations. It sends
+    out the state on each timestep on `state_out`, and can receive an
+    updated state on `state_in` at each state update.
+    """
+    logger = logging.getLogger()
+    instance = Instance({
+            Operator.O_I: ['state_out'],
+            Operator.S: ['state_in'],
+            Operator.O_F: ['final_state_out']})
+
+    while instance.reuse_instance():
+        # F_INIT
+        t_max = instance.get_setting('t_max', 'float')
+        dt = instance.get_setting('dt', 'float')
+        x_max = instance.get_setting('x_max', 'float')
+        dx = instance.get_setting('dx', 'float')
+        d = instance.get_setting('d', 'float')
+
+        if instance.resuming():
+            msg = instance.load_snapshot()
+            U = msg.data[0].array.copy()
+            Us = msg.data[1].array.copy()
+            t_cur = msg.timestamp
+
+        if instance.should_init():
+            U = np.zeros(int(round(x_max / dx))) + 1e-20
+            U[25] = 2.0
+            U[50] = 2.0
+            U[75] = 2.0
+            Us = U
+            t_cur = 0.0
+
+        while t_cur + dt <= t_max:
+            # O_I
+            t_next = t_cur + dt
+            if t_next + dt > t_max:
+                t_next = None
+            cur_state_msg = Message(t_cur, t_next, Grid(U, ['x']))
+            instance.send('state_out', cur_state_msg)
+
+            # S
+            msg = instance.receive('state_in', default=cur_state_msg)
+            if msg.timestamp > t_cur + dt:
+                logger.warning('Received a message from the future!')
+            np.copyto(U, msg.data.array)
+
+            dU = np.zeros_like(U)
+            dU[1:-1] = d * laplacian(U, dx) * dt
+            dU[0] = dU[1]
+            dU[-1] = dU[-2]
+
+            U += dU
+            Us = np.vstack((Us, U))
+            t_cur += dt
+
+            if instance.should_save_snapshot(t_cur):
+                msg = Message(t_cur, None, [Grid(U), Grid(Us)])
+                instance.save_snapshot(msg)
+
+        # O_F
+        final_state_msg = Message(t_cur, None, Grid(U, ['x']))
+        instance.send('final_state_out', final_state_msg)
+
+        if 'DONTPLOT' not in os.environ and 'SLURM_NODENAME' not in os.environ:
+            from matplotlib import pyplot as plt
+            plt.figure()
+            plt.imshow(
+                    np.log(Us + 1e-20),
+                    origin='upper',
+                    extent=[
+                        -0.5*dx, x_max - 0.5*dx,
+                        (t_max - 0.5*dt) * 1000.0, -0.5*dt * 1000.0],
+                    interpolation='none',
+                    aspect='auto'
+                    )
+            cbar = plt.colorbar()
+            cbar.set_label('log(Concentration)', rotation=270, labelpad=20)
+            plt.xlabel('x')
+            plt.ylabel('t (ms)')
+            plt.title('Concentration over time')
+            plt.show()
+
+        if instance.should_save_final_snapshot():
+            msg = Message(t_cur, None, [Grid(U), Grid(Us)])
+            instance.save_final_snapshot(msg)
+
+
+if __name__ == '__main__':
+    logging.basicConfig()
+    logging.getLogger().setLevel(logging.INFO)
+    diffusion()
diff --git a/docs/source/examples/python/checkpointing_reaction.py b/docs/source/examples/python/checkpointing_reaction.py
new file mode 100644
index 00000000..07204e64
--- /dev/null
+++ b/docs/source/examples/python/checkpointing_reaction.py
@@ -0,0 +1,57 @@
+import logging
+
+from libmuscle import Grid, Instance, Message
+from ymmsl import Operator
+
+
+def reaction() -> None:
+    """A simple exponential reaction model on a 1D grid.
+    """
+    instance = Instance({
+            Operator.F_INIT: ['initial_state'],     # list of float
+            Operator.O_F: ['final_state']})         # list of float
+
+    while instance.reuse_instance():
+        t_max = instance.get_setting('t_max', 'float')
+        dt = instance.get_setting('dt', 'float')
+        k = instance.get_setting('k', 'float')
+
+        if instance.resuming():
+            msg = instance.load_snapshot()
+            if msg.data is not None:
+                # A final snapshot does not have data in it, but that's fine: we
+                # will do the F_INIT step inside `should_init()` below.
+                U = msg.data[0].array.copy()
+                t_cur = msg.timestamp
+                t_stop = msg.data[1]
+
+        # F_INIT
+        if instance.should_init():
+            msg = instance.receive('initial_state')
+            U = msg.data.array.copy()
+            t_cur = msg.timestamp
+            t_stop = msg.timestamp + t_max
+
+        while t_cur + dt < t_stop:
+            # O_I
+
+            # S
+            U += k * U * dt
+            t_cur += dt
+
+            if instance.should_save_snapshot(t_cur):
+                instance.save_snapshot(Message(t_cur, None, [
+                        Grid(U, ['x']),
+                        t_stop]))
+
+        # O_F
+        instance.send('final_state', Message(t_cur, None, Grid(U, ['x'])))
+
+        if instance.should_save_final_snapshot():
+            instance.save_final_snapshot(Message(t_cur, None, None))
+
+
+if __name__ == '__main__':
+    logging.basicConfig()
+    logging.getLogger().setLevel(logging.INFO)
+    reaction()
diff --git a/docs/source/examples/rd_checkpoints.ymmsl b/docs/source/examples/rd_checkpoints.ymmsl
new file mode 100644
index 00000000..82dfd638
--- /dev/null
+++ b/docs/source/examples/rd_checkpoints.ymmsl
@@ -0,0 +1,31 @@
+ymmsl_version: v0.1
+
+model:
+  name: checkpointing_reaction_diffusion_python
+
+  components:
+    macro:
+      implementation: checkpointing_diffusion_python
+      ports:
+        o_i: state_out
+        s: state_in
+
+    micro:
+      implementation: checkpointing_reaction_python
+      ports:
+        f_init: initial_state
+        o_f: final_state
+
+  conduits:
+    macro.state_out: micro.initial_state
+    micro.final_state: macro.state_in
+
+resources:
+  macro:
+    threads: 1
+  micro:
+    threads: 1
+
+checkpoints:
+  simulation_time:
+  - every: 2.0e-05
diff --git a/docs/source/examples/rd_implementations.ymmsl.in b/docs/source/examples/rd_implementations.ymmsl.in
index 4f2b0c7b..92cc2b8d 100644
--- a/docs/source/examples/rd_implementations.ymmsl.in
+++ b/docs/source/examples/rd_implementations.ymmsl.in
@@ -62,3 +62,13 @@ implementations:
     env:
       +LD_LIBRARY_PATH: :MUSCLE3_HOME/lib
     executable: MUSCLE3_EXAMPLES/fortran/build/load_balancer
+
+  checkpointing_reaction_python:
+    virtual_env: MUSCLE3_EXAMPLES/python/build/venv
+    executable: python
+    args: MUSCLE3_EXAMPLES/python/checkpointing_reaction.py
+
+  checkpointing_diffusion_python:
+    virtual_env: MUSCLE3_EXAMPLES/python/build/venv
+    executable: python
+    args: MUSCLE3_EXAMPLES/python/checkpointing_diffusion.py

From 241707ff4d215bec29027772ce6105a8acafc6c4 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Mon, 5 Dec 2022 14:22:34 +0100
Subject: [PATCH 141/183] Fix --run-dir ignored when not using --start-all

---
 muscle3/muscle_manager.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/muscle3/muscle_manager.py b/muscle3/muscle_manager.py
index 02d30803..b0bed73a 100644
--- a/muscle3/muscle_manager.py
+++ b/muscle3/muscle_manager.py
@@ -1,4 +1,4 @@
-from datetime import datetime, timezone
+from datetime import datetime
 from pathlib import Path
 import sys
 from typing import Optional, Sequence
@@ -68,12 +68,15 @@ def manage_simulation(
     else:
         run_dir_path = Path(run_dir).resolve()
 
+    run_dir_obj = RunDir(run_dir_path)
     if start_all:
-        run_dir_obj = RunDir(run_dir_path)
         manager = Manager(configuration, run_dir_obj, log_level)
         manager.start_instances()
     else:
-        manager = Manager(configuration, None, log_level)
+        if run_dir is None:
+            manager = Manager(configuration, None, log_level)
+        else:
+            manager = Manager(configuration, run_dir_obj, log_level)
         print(manager.get_server_location())
 
     success = manager.wait()

From 352cffd2f74823e259cd01651671a8e78f964f25 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Mon, 5 Dec 2022 16:43:43 +0100
Subject: [PATCH 142/183] First draft of checkpointing user documentation.

---
 docs/source/checkpointing.rst             | 539 ++++++++++++++++++++++
 docs/source/examples/rd_checkpoints.ymmsl |   2 +
 docs/source/index.rst                     |   1 +
 3 files changed, 542 insertions(+)
 create mode 100644 docs/source/checkpointing.rst

diff --git a/docs/source/checkpointing.rst b/docs/source/checkpointing.rst
new file mode 100644
index 00000000..62f9952a
--- /dev/null
+++ b/docs/source/checkpointing.rst
@@ -0,0 +1,539 @@
+Simulation checkpoints
+======================
+
+When you execute a long-running simulation, it can be very helpful to store the
+state of a simulation at certain intervals. For example, your simulation running
+on a HPC cluster may crash, just before it's finished, due to insufficient
+memory available. Instead of restarting this simulation from scratch, you could
+restart it -- with an increased memory allocation -- from a checkpoint, which
+would save a lot of compute time!
+
+Checkpointing in distributed simulations is difficult. Fortunately, MUSCLE3
+comes with built-in checkpointing support. This page describes in detail how to
+use the MUSCLE3 checkpointing API, how to specify checkpoints in the workflow
+configuration and how to resume a workflow.
+
+In the :ref:`user tutorial`, you can read about the checkpointing concepts and
+how to use the API when running and resuming MUSCLE3 simulations. This is
+followed by a :ref:`developer tutorial`, which explains how to add checkpointing
+capabilities to your MUSCLE3 component. Finally, the :ref:`checkpointing
+deep-dive` describes in detail the (inner) working of checkpointing in MUSCLE3;
+though this level of detail is not required for general usage of the API.
+
+
+Glossary
+--------
+
+.. glossary::
+
+    Checkpoint
+        A checkpoint is a moment during the workflow where the user wants
+        to have the state of the whole workflow stored.
+
+    Snapshot
+        A snapshot is the stored state of an instance in the workflow.
+
+    Workflow snapshot
+        A workflow snapshot is a collection of :term:`snapshots<snapshot>` for
+        all instances in the workflow, which can be resumed from. This means
+        that the snapshots of every combination of :term:`peer instances` must
+        be :ref:`consistent <Snapshot consistency>`.
+
+    Peer instances
+        Two instances that are connected by a Conduit.
+
+
+User tutorial
+-------------
+
+
+Defining checkpoints
+````````````````````
+
+The first step for using checkpoints is to define checkpoints in your workflow.
+The checkpoint definitions are for your whole workflow, and you can specify them
+in yMMSL as in the following example:
+
+.. code-block:: yaml
+    :caption: Example checkpoint definition in yMMSL.
+
+    checkpoints:
+      at_end: true
+      simulation_time:
+      - every: 10
+        start: 0
+        stop: 100
+      - every: 20
+        start: 100
+      wallclock_time:
+      - every: 3600
+      - at:
+        - 300
+        - 600
+        - 1800
+
+Let's break this down: the first element in this example ``checkpoints``
+definition is ``at_end``. When this is set to ``true`` (as in the example), it
+means that every instance in the workflow will create a snapshot just before the
+workflow finishes. This set of snapshots can be used to resume a simulation near
+the end and, for example, let it run for a longer time. Some caveats apply,
+though, see :ref:`resuming from *at_end* snapshots` for full details.
+
+The other two items in the ``checkpoints`` definition are the time-based
+:ref:`simulation time<Simulation time checkpoints>` and
+:ref:`wallclock time<Wallclock time checkpoints>`. You can use two types of
+rules to set checkpoint moments for these:
+
+.. _at checkpoint rule:
+
+#. ``at`` rules define specific moments. The example rule above request a
+   checkpoint to be taken at 300, 600 and 1800 seconds after the start of the
+   simulation. You can define multiple times in one ``at`` rule, but you may
+   also add multiple ``at`` rules. The following definitions are all equivalent:
+
+   .. tabs::
+
+        .. tab:: Standard
+
+            .. code-block:: yaml
+
+                checkpoints:
+                  wallclock_time:
+                  - at:
+                    - 300
+                    - 600
+                    - 1800
+
+        .. tab:: Inline list
+
+            .. code-block:: yaml
+
+                checkpoints:
+                  wallclock_time:
+                  - at: [300, 600, 1800]
+
+        .. tab:: Multiple ``at`` rules
+
+            .. code-block:: yaml
+
+                checkpoints:
+                  wallclock_time:
+                  - at: 300
+                  - at: 600
+                  - at: 1800
+
+.. _every checkpoint rule:
+
+#. ``every`` rules define a recurring set of checkpoints. In the simplest form
+   you indicate the interval at which checkpoints should be taken -- every hour
+   in the ``wallclock_time`` example above. You may optionally indicate a
+   ``start`` or ``stop`` -- as in the ``simulation_time`` example above.
+
+   .. tabs::
+
+        .. tab:: Simple
+
+            .. code-block:: yaml
+                :caption: Without ``start`` and ``stop`` indicated, this rule creates a snapshot every hour of elapsed time.
+
+                checkpoints:
+                  wallclock_time:
+                    every: 3600
+
+        .. tab:: Start and stop
+
+            .. code-block:: yaml
+                :caption: This combination of rules define a checkpoint at ``t=0``, ``t=10``, ..., until ``t=100``. Afterwards it continues indefinitely every 20 time units (``t=120``, ``t=140``, ...).
+
+                checkpoints:
+                  simulation_time:
+                  - every: 10
+                    start: 0
+                    stop: 100
+                  - every: 20
+                    start: 100
+
+        .. tab:: Overlapping ranges
+
+            .. code-block:: yaml
+                :caption: Overlapping ranges work as well. This combination defines a checkpoint every unit of time (``t=0``, ``t=1``, ...), and additionally at ``t=0.25``, ``t=0.75``, ``t=1.25`` and ``t=1.75``.
+
+                checkpoints:
+                  simulation_time:
+                  - every: 1
+                  - every: 0.25
+                    start: 0
+                    stop: 2
+
+   .. note::
+
+        When ``stop`` is specified, the stop time is included when ``stop ==
+        start + n * every``, with ``n`` a positive whole number. However, this
+        might give surprising results due to the inaccuracies of floating point
+        computations. Compare for example:
+
+        .. code-block:: yaml
+            :caption: This specifies a checkpoint at 0, 1, 2, ..., 6 and 7.
+
+            checkpoints:
+              simulation_time:
+              - every: 1
+                start: 0
+                stop: 7
+
+        .. code-block:: yaml
+            :caption: However this only checkpoints at 0, 0.1, 0.2, ... 0.5 and 0.6!
+
+            checkpoints:
+              simulation_time:
+              - every: 0.1
+                start: 0
+                stop: 0.7
+
+        Why the difference? Well - compare in python:
+
+        .. code-block:: python
+
+            >>> 7 * 1.0
+            7.0
+            >>> 7 * 0.1
+            0.7000000000000001
+
+        Since ``0.7000000000000001`` is larger than ``0.7``, no checkpoint will
+        be generated for this time.
+
+.. seealso::
+
+    yMMSL documentation on :external+ymmsl:ref:`Checkpoints`
+
+    yMMSL API reference: :external:py:class:`ymmsl.Checkpoints`,
+    :external:py:class:`ymmsl.CheckpointAtRule`,
+    :external:py:class:`ymmsl.CheckpointRangeRule`
+
+
+Simulation time checkpoints
+'''''''''''''''''''''''''''
+
+Checkpoints defined in the ``simulation_time`` section are taken based on the
+time inside your simulation. It will only work correctly if all components in
+the simulation have a shared concept of time, which only increases during the
+simulation. This should be no problem for physics-based simulations, though it
+does require that the instances make correct use of the :ref:`timestamp in
+MUSCLE3 messages <message timestamps>`. When this requirement is fulfilled,
+checkpoints based on simulation time are the most reliable way to checkpoint
+your workflow.
+
+MUSCLE3 does not interpret or convert the units that you configure in the
+checkpoints. The units are the same as the components in the simulation use for
+the timestamps in the messages. Typically this will be in SI seconds, but
+components may deviate from this standard. MUSCLE3 assumes that all components
+in the workflow use the same time units in the interfaces to libmuscle.
+
+.. note::
+
+    MUSCLE3 does not assume anything about the start time of a simulation. Your
+    simulation time may start at any value, even negative! Therefore,
+    :ref:`checkpoint ranges <every checkpoint rule>` include 0 and negative
+    numbers when no ``start`` value is provided.
+
+    Because MUSCLE3 does not know what internal time your simulation starts on,
+    an ``every`` rule without a ``start`` value will always trigger a checkpoint
+    at the first possible moment in the simulation. You should supply a
+    ``start`` value if you do not want this to happen.
+
+
+Wallclock time checkpoints
+''''''''''''''''''''''''''
+
+Checkpoints defined in the ``wallclock_time`` section are taken based on the
+elapsed wallclock time of your simulation (also known as *elapsed real time*).
+Each component in the simulation will make a snapshot at the earliest possible
+moment after a checkpoint is passed.
+
+The checkpoint times in the configuration are interpreted as seconds since the
+initialization of ``muscle_manager``.
+
+.. warning::
+
+    Wallclock time checkpoint definitions are (currently) not a reliable way to
+    create :term:`workflow snapshots <workflow snapshot>`. While each instance
+    in the simulation will create a snapshot when requested, there is no
+    guarantee that all snapshots are :ref:`consistent <Snapshot consistency>`.
+
+    When a simulation has relatively simple coupling between components, i.e.
+    only one peer instance per :external:py:class:`~ymmsl.Operator`,
+    checkpointing based on wallclock time usually works fine.
+
+    However for co-simulation (the *interact* coupling type) and more complex
+    coupling, it is likely that not all checkpoints lead to a consistent
+    :term:`workflow snapshot`.
+
+
+Running a simulation with checkpoints
+`````````````````````````````````````
+
+Starting a simulation with checkpoints is no different than starting one
+without. You need to start the ``muscle_manager`` with the configuration yMMSL
+file (or files), as well as the individual components (or let ``muscle_manager``
+start them for you with the ``--start-all`` flag). The sole difference is that
+the yMMSL configuration must contain a :ref:`checkpoints section <Defining
+checkpoints>`.
+
+When ``muscle_manager`` is started with checkpoints configured, a couple of
+things change. First, **all** of the component implementations **must** support
+checkpointing: the simulation will stop with an error if this is not the case.
+The simulation may also stop with an error if there is an issue in the
+checkpointing implementation of any of the components.
+
+Second, all components are instructed to make snapshots according to the
+configured checkpoints. ``muscle_manager`` keeps track of all created snapshots
+during the simulation, looking for :term:`workflow snapshots <workflow
+snapshot>`. When a workflow snapshot is detected, ``muscle_manager`` writes a
+yMMSL file that can be used to :ref:`resume the simulation <Resuming a
+simulation>`.
+
+During the simulation, all of the created snapshots are stored on the file
+system. See below table for the directories where MUSCLE3 stores the files.
+Note: a run-directory is automatically created when using the ``--start-all``
+flag for ``muscle_manager``. You may also specify a custom run directory through
+the ``--run-dir DIRECTORY`` option. When you do not provide a run directory, the
+last column in below table indicates where snapshots are stored.
+
+.. list-table:: Directories where MUSCLE3 stores snapshot files.
+    :header-rows: 1
+
+    * - Snapshot type
+      - Run directory provided
+      - No run directory provided
+    * - Workflow
+      - ``run_dir/snapshots/``
+      - Working directory of ``muscle_manager``
+    * - Instance
+      - ``run_dir/instances/<instance>/snapshots/``,
+
+        with ``<instance>`` the name of the instance.
+      - Working directory of the instance
+
+.. note::
+
+    When running a :ref:`distributed simulation <distributed execution>` on
+    multiple compute nodes, MUSCLE3 assumes that the run directory is accessible
+    to all nodes (i.e. on a shared or distributed file system). This is usually
+    the case on HPC clusters.
+
+
+Example: running the reaction-diffusion model with checkpoints
+''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
+
+The reaction-diffusion example model from the :ref:`Tutorial with Python` also
+has a variant with checkpointing enabled. To run this yourself, navigate in a
+command line prompt to the ``docs/source/examples`` folder in the MUSCLE3 git
+repository. Then execute the following command:
+
+.. code-block:: bash
+
+    $ mkdir run_rd_example
+    $ muscle_manager --start-all --run-dir run_rd_example rd_implementations.ymmsl rd_checkpoints.ymmsl rd_settings.ymmsl
+
+.. note::
+
+    You may get an error ``File 'rd_implementations.ymmsl' does not exist.`` To
+    fix this, you need to build the examples in the MUSCLE3 source; in the root
+    of the git repository, execute:
+
+    .. code-block::
+
+        $ make test_examples
+
+Above command runs the ``muscle_manager`` and starts all components (the
+reaction model and the diffusion model). The ``rd_checkpoints.ymmsl`` file
+contains the checkpoint definitions used in this example:
+
+.. literalinclude:: examples/rd_checkpoints.ymmsl
+    :caption: ``docs/source/examples/rd_checkpoints.ymmsl, lines 31-33``
+    :lines: 31-33
+    :language: yaml
+
+MUSCLE3 will create the run directory ``run_rd_example`` for you. In it you'll
+find the instance snapshots in ``instances/macro/snapshots`` and
+``instances/micro/snapshots``. The workflow snapshots are stored in the
+``snapshots`` folder in the run directory.
+
+Resuming a simulation
+`````````````````````
+
+You can resume a simulation from a :term:`workflow snapshot` stored in a
+previous run of the simulation. This works by appending a workflow snapshot
+yMMSL file from a previous run to the regular yMMSL configuration. If you
+started your original simulation with::
+
+    $ muscle_manager --run-dir ./run1 configuration.ymmsl
+
+You can resume it from a snapshot of this run like so::
+
+    $ muscle_manager --run-dir ./run2 configuration.ymmsl ./run1/snapshots/snapshot_20221202_112840.ymmsl
+
+Here we choose a different run directory, and resume from the snapshot file
+``snapshot_20221202_112840.ymmsl`` that was produced by the first run. This file
+contains the information required to resume the workflow:
+
+-   It contains a ``description`` which allows you to inspect metadata of the
+    workflow snapshot. It indicates the trigger or triggers leading to this
+    snapshot, and some information of the state of each component in the
+    workflow. This data is for informational purposes only, and ignored by
+    ``muscle_manager``.
+-   It also contains the paths to the snapshots that each instance needs to
+    resume. Note that these snapshots must still exist on the same location. If
+    you move or delete them (or a parent directory), resuming your simulation
+    will fail with an error message::
+
+        Unable to load snapshot: <snapshot filename> is not a file. Please ensure this path exists and can be read.
+
+
+Example: resuming the reaction-diffusion model
+''''''''''''''''''''''''''''''''''''''''''''''
+
+To resume the reaction-diffusion model from a snapshot created in the
+:ref:`previous section <Example: running the reaction-diffusion model with
+checkpoints`, replace ``<date>`` and ``<time>`` in the following command to poin
+to the snapshot you want to resume from and execute it.
+
+.. code-block:: bash
+    :caption: Resume from an earlier snapshot. Replace ``<date>`` and ``<time>`` to point to an actual snapshot file.
+
+    $ mkdir run_rd_resume
+    $ muscle_manager --start-all --run-dir run_rd_resume rd_implementations.ymmsl rd_checkpoints.ymmsl rd_settings.ymmsl run_rd_example/snapshots/snapshot_<date>_<time>.ymmsl
+
+When the command completes you can see the output in the new working directory
+``run_rd_resume``.
+
+
+Making changes to your simulation
+'''''''''''''''''''''''''''''''''
+
+MUSCLE3 checkpointing is designed for resuming simulations as if they never
+stopped. This means that resuming is only supported for :ref:`consistent
+snapshots <Snapshot consistency>` and for simulation configurations that have
+not changed.
+
+MUSCLE3 does not support any changes to the model when resuming, such as adding
+or removing components, or changing conduits. Attempting this will likely lead
+to deadlocks or error messages.
+
+You are allowed to change the settings of your simulation when resuming.
+However, it depends on the implementation of your components if and when changed
+settings take effect. Please ask the developers of your simulation components
+for this information.
+
+
+Resuming from *at_end* snapshots
+''''''''''''''''''''''''''''''''
+
+.. warning::
+    Resuming from only ``at_end`` snapshot will immediately complete.
+    TODO: Need to think about this still.
+
+
+Snapshot consistency
+````````````````````
+
+MUSCLE3 checkpointing was designed for consistency: no messages between the
+components must be lost when restarting. When we fulfill this criterium, a
+simulation can resume from a checkpoint as if it was never interrupted.
+
+During a simulation run, each component creates snapshots independent from all
+other components. For :ref:`simulation time checkpoints`, the MUSCLE3
+checkpointing algorithm is guaranteed to give consistent :term:`workflow
+snapshots <workflow snapshot>` when all components adhere to the
+:ref:`Multiscale Modeling and Simulation Framework (MMSF) <citation needed>`.
+
+:ref:`Wallclock time checkpoints` in the currrent implementation are less
+reliable: components may take snapshots while messages are still in transit.
+When that happens, it would lead to an inconsistent state and no workflow
+snapshots would be written by ``muscle_manager``.
+
+MUSCLE3 does not support combining inconsistent snapshots, so it is not possible
+to freely mix snapshots produced during a simulation. When resuming, MUSCLE3
+checks the consistency of all snapshots. The run will end with an error when an
+inconsistent state is detected::
+
+    Received message on <port> with unexpected message number <num>. Was
+    expecting <num>. Are you resuming from an inconsistent snapshot?
+
+When resuming from a :ref:`snapshot yMMSL <Running a simulation with
+checkpoints>` written by ``muscle_manager``, you should not encounter this
+error.
+
+
+Troubleshooting
+```````````````
+
+General troubleshooting strategy:
+
+1.  First try to find the root cause of the problem that your simulation ran
+    into. You can start by looking in the log file of the ``muscle_manager``,
+    located in ``<run directory>/muscle3_manager.log``. This log file may show
+    the error message or point you in the right direction.
+2.  If the ``muscle_manager`` log did not display an error, it may indicate
+    which component failed first. Have a look at the logs of that component to
+    figure out what went wrong. The output of an instance is usually found in
+    ``<run directory>/instances/<instance name>/``. Open ``stdout.txt`` and
+    ``stderr.txt`` to find out what went wrong.
+3.  If the ``muscle_manager`` logs did not point to a specific instance, you
+    should have a look at the log files of each instance (see point 2 for
+    instructions). Note that some instances may log ``Broken Pipe`` errors --
+    this usually happens when a peer component has crashed and it is typically
+    not the root cause of your simulation crash.
+
+Once you find the root cause of your problem, check below list for common issues
+and their resolutions. You may also have found a bug in MUSCLE3: please help us
+and your fellow MUSCLE3 users by :ref:`creating an issue <Make an issue>` on
+github.
+
+1. The simulation crashes when using checkpoints.
+    The first thing you should check is: does the simulation run error-free when
+    checkpoints are disabled? You can test this by commenting the checkpointing
+    section of your input ymmsl file(s).
+
+    If it runs error-free without checkpoints, have a look at the error message
+    in the log file generated by your run. MUSCLE3 attempts to have clear error
+    messages to explain what went wrong and give you pointers to a solution.
+
+    When the error message indicates a problem with the implementation of the
+    checkpointing API, please check with the developer of the component to fix
+    this. If you are the developer of the component, please see the
+    :ref:`Developer tutorial` section for additional resources.
+
+2. The simulation crashes when resuming.
+    Some common causes for this are:
+
+    -   The snapshot files that the instances are resuming from no longer exist.
+        This could for example happen when a previous run directory has been
+        moved or deleted. For distributed execution, some compute nodes may not
+        be able to access the directories where the instance snapshots are
+        stored. See also :ref:`Resuming a simulation`.
+    -   Your simulation configuration has incompatible changes compared to the
+        original simulation that the snapshots were from. See :ref:`Making
+        changes to your simulation`. Luckily, MUSCLE3 stores the previous
+        simulation configuration in the run directory. If the snapshot that your
+        resume from is stored in ``run1/snapshots/snapshot_xyz.ymmsl``, then you
+        can find that configuration in ``run1/configuration.ymmsl``. Try
+        resuming with that configuration first to see if this is the real
+        problem::
+            $ muscle_manager --run-dir run2 run1/configuration.ymmsl run1/snapshots/snapshot_xyz.ymmsl
+    -   One of your components has a bug with resuming from a previous snapshot,
+        or perhaps your snapshot belonged to a different version of the
+        component. Please ask your component developer(s) for help.
+
+
+
+Developer tutorial
+------------------
+
+TODO
+
+
+Checkpointing deep-dive
+-----------------------
+
+TODO
diff --git a/docs/source/examples/rd_checkpoints.ymmsl b/docs/source/examples/rd_checkpoints.ymmsl
index 82dfd638..931d66bb 100644
--- a/docs/source/examples/rd_checkpoints.ymmsl
+++ b/docs/source/examples/rd_checkpoints.ymmsl
@@ -26,6 +26,8 @@ resources:
   micro:
     threads: 1
 
+# Note: below three lines are explicitly mentioned in checkpointing.rst. Do not
+# forget to update that literalinclude when the line numbers change!
 checkpoints:
   simulation_time:
   - every: 2.0e-05
diff --git a/docs/source/index.rst b/docs/source/index.rst
index ed55ba48..1e7c05d0 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -38,6 +38,7 @@ Cham.  `<https://doi.org/10.1007/978-3-030-50433-5_33>`_
    fortran
    mpi
    uncertainty_quantification
+   checkpointing
    python_api
    cpp_api
    fortran_api

From d13939a2cba1c4fefb9017a63cd5108c56f2b42a Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Tue, 6 Dec 2022 11:05:32 +0100
Subject: [PATCH 143/183] Change order of dependencies in tox.ini #139

---
 tox.ini | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tox.ini b/tox.ini
index 020ee9e2..2a98e75a 100644
--- a/tox.ini
+++ b/tox.ini
@@ -39,7 +39,7 @@ deps =
     six
     sphinx
     sphinx-fortran
-    sphinx-tabs
     sphinx_rtd_theme
+    sphinx-tabs
 commands = sphinx-build docs/source docs/build -bhtml
 

From b70dad04785616f80c006bca9a366409f8f10415 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Tue, 6 Dec 2022 11:07:02 +0100
Subject: [PATCH 144/183] Let tox generate requirements.txt on readthedocs

Also bump RTD python version to 3.8
---
 .readthedocs.yml      | 18 +++++++++++++-----
 docs/requirements.txt | 11 -----------
 2 files changed, 13 insertions(+), 16 deletions(-)
 delete mode 100644 docs/requirements.txt

diff --git a/.readthedocs.yml b/.readthedocs.yml
index c1febabb..2a63fe5e 100644
--- a/.readthedocs.yml
+++ b/.readthedocs.yml
@@ -2,8 +2,16 @@
 
 version: 2
 
-# Optionally set the version of Python and requirements required to build your docs
-python:
-  version: 3.6
-  install:
-    - requirements: docs/requirements.txt
+build:
+  os: "ubuntu-20.04"
+  tools:
+    python: "3.8"
+  jobs:
+    post_create_environment:
+      - pip install tox
+      # let tox set up a build environment
+      - tox -e docs --notest
+      # export tox installed packages as requirements for the install step
+      - .tox/docs/bin/pip freeze > docs/requirements.txt
+    post_install:
+      - pip install -r docs/requirements.txt
diff --git a/docs/requirements.txt b/docs/requirements.txt
deleted file mode 100644
index 98b8214f..00000000
--- a/docs/requirements.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-breathe
-click
-msgpack==0.6.1
-netifaces
-numpy>=1.12
-qcg-pilotjob
-six
-sphinx-fortran
-sphinx-tabs
-typing==3.6.6
-ymmsl

From cc89c2b8cec7a92c507fcb6baaf097fcd2742e73 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Wed, 7 Dec 2022 15:32:32 +0100
Subject: [PATCH 145/183] First draft of checkpointing developer tutorial

---
 docs/source/checkpointing.rst                 | 586 +++++++++++++++++-
 .../python/checkpointing_diffusion.py         |  10 +-
 .../examples/python/checkpointing_reaction.py |   8 +-
 docs/source/examples/python/reaction.py       |   1 -
 .../examples/python/stateless_reaction.py     |  39 ++
 .../templates/checkpointing_instance.py       |  54 ++
 docs/source/templates/instance.cpp            |  55 ++
 docs/source/templates/instance.f90            |  66 ++
 docs/source/templates/instance.py             |  39 ++
 .../checkpointing_diffusion_partial.py        | 112 ++++
 .../checkpointing_instance_partial.py         |  49 ++
 .../checkpointing_reaction_partial.py         |  47 ++
 12 files changed, 1051 insertions(+), 15 deletions(-)
 create mode 100644 docs/source/examples/python/stateless_reaction.py
 create mode 100644 docs/source/templates/checkpointing_instance.py
 create mode 100644 docs/source/templates/instance.cpp
 create mode 100644 docs/source/templates/instance.f90
 create mode 100644 docs/source/templates/instance.py
 create mode 100644 docs/source/tutorial_code/checkpointing_diffusion_partial.py
 create mode 100644 docs/source/tutorial_code/checkpointing_instance_partial.py
 create mode 100644 docs/source/tutorial_code/checkpointing_reaction_partial.py

diff --git a/docs/source/checkpointing.rst b/docs/source/checkpointing.rst
index 62f9952a..51ce7869 100644
--- a/docs/source/checkpointing.rst
+++ b/docs/source/checkpointing.rst
@@ -20,6 +20,10 @@ capabilities to your MUSCLE3 component. Finally, the :ref:`checkpointing
 deep-dive` describes in detail the (inner) working of checkpointing in MUSCLE3;
 though this level of detail is not required for general usage of the API.
 
+.. contents:: Contents
+    :local:
+    :depth: 1
+
 
 Glossary
 --------
@@ -46,6 +50,14 @@ Glossary
 User tutorial
 -------------
 
+This user tutorial explains all you need to know about checkpointing for running
+and resuming simulations. Some details are deliberately left out, though you
+can read all about those in the :ref:`developer tutorial` or :ref:`checkpointing
+deep-dive`.
+
+.. contents:: User totorial contents
+    :local:
+
 
 Defining checkpoints
 ````````````````````
@@ -208,7 +220,7 @@ rules to set checkpoint moments for these:
 
     yMMSL API reference: :external:py:class:`ymmsl.Checkpoints`,
     :external:py:class:`ymmsl.CheckpointAtRule`,
-    :external:py:class:`ymmsl.CheckpointRangeRule`
+    :external:py:class:`ymmsl.CheckpointRangeRule`.
 
 
 Simulation time checkpoints
@@ -395,8 +407,8 @@ Example: resuming the reaction-diffusion model
 
 To resume the reaction-diffusion model from a snapshot created in the
 :ref:`previous section <Example: running the reaction-diffusion model with
-checkpoints`, replace ``<date>`` and ``<time>`` in the following command to poin
-to the snapshot you want to resume from and execute it.
+checkpoints>`, replace ``<date>`` and ``<time>`` in the following command to
+point to the snapshot you want to resume from.
 
 .. code-block:: bash
     :caption: Resume from an earlier snapshot. Replace ``<date>`` and ``<time>`` to point to an actual snapshot file.
@@ -520,7 +532,9 @@ github.
         can find that configuration in ``run1/configuration.ymmsl``. Try
         resuming with that configuration first to see if this is the real
         problem::
+
             $ muscle_manager --run-dir run2 run1/configuration.ymmsl run1/snapshots/snapshot_xyz.ymmsl
+
     -   One of your components has a bug with resuming from a previous snapshot,
         or perhaps your snapshot belonged to a different version of the
         component. Please ask your component developer(s) for help.
@@ -530,10 +544,572 @@ github.
 Developer tutorial
 ------------------
 
-TODO
+This developer tutorial explains all you need to know about implementing
+checkpointing in your MUSCLE3 simulation component. If you're not a developer
+and want to learn how to define checkpoints and resume simulations, please have
+a look at the :ref:`user tutorial`.
+
+Some details are deliberately left out in this developer tutorial, though you
+can read all about those in the :ref:`checkpointing deep-dive`.
+
+.. contents:: Developer totorial contents
+    :local:
+
+
+Start situation: components without checkpointing
+`````````````````````````````````````````````````
+
+In this tutorial we will add checkpointing to the reaction and diffusion
+components from the :ref:`Python <Tutorial with Python>`, :ref:`C++ <MUSCLE and C++>`
+and :ref:`Fortran <MUSCLE and Fortran>` tutorials.
+
+Additionally, we will do the same for a generic MUSCLE3 component template.
+These templates illustrate the structure of a MUSCLE3 component, but they are
+not complete and cannot be executed.
+
+.. tabs::
+
+    .. group-tab:: Reaction model
+
+        .. tabs::
+
+            .. group-tab:: Python
+
+                .. literalinclude:: examples/python/reaction.py
+                    :caption: ``docs/source/examples/python/reaction.py``
+                    :language: python
+
+            .. group-tab:: C++
+
+                .. literalinclude:: examples/cpp/reaction.cpp
+                    :caption: ``docs/source/examples/cpp/reaction.cpp``
+                    :language: c++
+
+            .. group-tab:: Fortran
+
+                .. literalinclude:: examples/fortran/reaction.f90
+                    :caption: ``docs/source/examples/fortran/reaction.f90``
+                    :language: fortran
+
+    .. group-tab:: Diffusion model
+
+        .. tabs::
+
+            .. group-tab:: Python
+
+                .. literalinclude:: examples/python/diffusion.py
+                    :caption: ``docs/source/examples/python/diffusion.py``
+                    :language: python
+
+            .. group-tab:: C++
+
+                .. literalinclude:: examples/cpp/diffusion.cpp
+                    :caption: ``docs/source/examples/cpp/diffusion.cpp``
+                    :language: c++
+
+            .. group-tab:: Fortran
+
+                .. literalinclude:: examples/fortran/diffusion.f90
+                    :caption: ``docs/source/examples/fortran/diffusion.f90``
+                    :language: fortran
+
+    .. group-tab:: Generic template
+
+        .. tabs::
+
+            .. group-tab:: Python
+
+                .. literalinclude:: templates/instance.py
+                    :caption: ``docs/source/templates/instance.py``
+                    :language: python
+
+            .. group-tab:: C++
+
+                .. literalinclude:: templates/instance.cpp
+                    :caption: ``docs/source/templates/instance.cpp``
+                    :language: c++
+
+            .. group-tab:: Fortran
+
+                .. literalinclude:: templates/instance.f90
+                    :caption: ``docs/source/templates/instance.f90``
+                    :language: fortran
+
+
+Step 1: implement checkpoint hooks
+``````````````````````````````````
+
+The first step in implementing the checkpointing API is implementing the
+checkpoint hooks. These are the points where your component can make
+checkpoints:
+
+1.  :ref:`Intermediate snapshots`
+
+    Intermediate snapshots are taken inside the reuse-loop, **immediately
+    after** the ``S`` Operator of your component.
+
+2.  :ref:`Final snapshots`
+
+    Final snapshots are taken at the **end of the reuse-loop**, after the
+    ``O_F`` Operator of your component.
+
+
+Intermediate snapshots
+''''''''''''''''''''''
+
+Intermediate snapshots are taken inside the reuse-loop, **immediately after**
+the ``S`` Operator of your component.
+
+Taking intermediate snapshots is optional. However, we recommend implementing
+intermediate snapshots when any of the following points holds for your
+component:
+
+1.  Your component has a loop containing ``O_I`` and ``S``, and you communicate
+    during Operator ``O_I`` or Operator ``S``.
+
+    Implementing intermediate checkpointing allows submodels connected to your
+    component to also create checkpoints.
+
+    .. warning::
+
+        If you do not implement intermediate checkpoints in this case, then it
+        is likely that a large amount of user-provided checkpoints will not lead
+        to consistent :term:`workflow snapshots <workflow snapshot>`. Please
+        implement intermediate snapshots to give the users of your component a
+        good checkpointing experience.
+
+2.  There is no communication during ``O_I`` and ``S``, but the state update
+    ``S`` is executed in a (time-integration) loop which takes a relatively long
+    time.
+
+    In this case, intermediate checkpointing allows users to create checkpoints
+    of your component during long-running computations.
+
+In all other cases, there usually is little or no added value in implementing
+intermediate snapshots in addition to :ref:`Final snapshots`.
+
+You implement taking intermediate snapshots as follows:
+
+1.  Find out where in your code to implement the checkpointing calls. Typically
+    there is a state update loop (e.g. a ``while`` or ``for`` loop) in a
+    component. You should implement the checkpointing calls **at the end** of
+    this state update loop. In this way, your code can resume immediately at the
+    begin of that loop. This allows for consistent restarts with the least
+    amount of code.
+2.  Ask libmuscle if you need to store your state and create an intermediate
+    snapshot with the API call ``should_save_snapshot(t)``. You must provide the
+    current time ``t`` in your simulation, such that MUSCLE3 can determine if
+    :ref:`Simulation time checkpoints` are triggered.
+3.  Collect the state that you need to store.
+4.  Create a ``libmuscle.Message`` object to put your state in.
+5.  Store the snapshot Message with the API call ``save_snapshot(message)``.
+
+See :ref:`Example: implemented checkpoint hooks` for example implementations in
+the reaction-diffusion models and the component template.
+
+.. seealso::
+    Python API documentation:
+    :py:meth:`~libmuscle.Instance.should_save_snapshot`,
+    :py:meth:`~libmuscle.Instance.save_snapshot`.
+
+
+Final snapshots
+'''''''''''''''
+
+Final snapshots **must** be implemented by all components supporting
+checkpointing. You implement taking final snapshot as follows:
+
+1.  You must implement the checkpoint calls at the end of the :ref:`reuse loop
+    <The reuse loop>`.
+2.  Ask libmuscle if you need to store your state and create a final snapshot
+    with the API call ``should_save_final_checkpoint()``. Contrary to the
+    intermediate checkpoints, this call may block to determine if a checkpoint
+    is needed (this is also the reason it must happen at the end of the reuse
+    loop).
+3.  Collect the state that you need to store.
+4.  Create a ``libmuscle.Message`` object to put your state in.
+5.  Store the snapshot Message with the API call
+    ``save_final_snapshot(message)``.
+
+See :ref:`Example: implemented checkpoint hooks` for example implementations in
+the reaction-diffusion models and the component template.
+
+.. seealso::
+    Python API documentation:
+    :py:meth:`~libmuscle.Instance.should_save_final_snapshot`,
+    :py:meth:`~libmuscle.Instance.save_final_snapshot`.
+
+
+Example: implemented checkpoint hooks
+'''''''''''''''''''''''''''''''''''''
+
+Note that below examples only shows the changes compared to the :ref:`start
+situation <Start situation: components without checkpointing>`. You can view the
+full contents of the files in the git repository.
+
+.. tabs::
+
+    .. group-tab:: Reaction model
+
+        .. rubric:: Intermediate snapshots
+
+        The state we need to store consists of three parts: the current ``U``,
+        the current time ``t_cur`` and the end-time for the time integration
+        ``t_stop``. The current time is stored as the ``timestamp`` attribute of
+        the ``Message`` object. The rest is stored in ``Message.data``.
+
+        .. rubric:: Final snapshots
+
+        For the final snapshot there is no state that is required for resuming.
+        The complete state will be received with the next message on the
+        ``initial_state`` port.
+
+        .. tabs::
+
+            .. group-tab:: Python
+
+                .. literalinclude:: tutorial_code/checkpointing_reaction_partial.py
+                    :caption: ``docs/source/tutorial_code/checkpointing_reaction_partial.py``
+                    :language: python
+                    :diff: examples/python/reaction.py
+
+            ..
+                group-tab:: C++
+
+                .. literalinclude:: examples/cpp/reaction.cpp
+                    :caption: ``docs/source/examples/cpp/reaction.cpp``
+                    :language: c++
+
+                TODO
+
+            ..
+                group-tab:: Fortran
+
+                .. literalinclude:: examples/fortran/reaction.f90
+                    :caption: ``docs/source/examples/fortran/reaction.f90``
+                    :language: fortran
+
+                TODO
+
+    .. group-tab:: Diffusion model
+
+        .. rubric:: Intermediate snapshots
+
+        The state we need to store consists of two parts: the current time
+        ``t_cur`` and the history of ``U``: ``Us``. Note that the last value of
+        ``U`` is contained in ``Us``, so we do not need to save ``U``
+        explicitly. The current time is stored as the ``timestamp`` attribute of
+        the ``Message`` object. ``Us`` is stored in ``Message.data``.
+
+        .. rubric:: Final snapshots
+
+        The same state is stored as for intermediate snapshots.
+
+        .. tabs::
+
+            .. group-tab:: Python
+
+                .. literalinclude:: tutorial_code/checkpointing_diffusion_partial.py
+                    :caption: ``docs/source/tutorial_code/checkpointing_diffusion_partial.py``
+                    :language: python
+                    :diff: examples/python/diffusion.py
+
+            ..
+                group-tab:: C++
+
+                .. literalinclude:: examples/cpp/diffusion.cpp
+                    :caption: ``docs/source/examples/cpp/diffusion.cpp``
+                    :language: c++
+
+                TODO
+
+            ..
+                group-tab:: Fortran
+
+                .. literalinclude:: examples/fortran/diffusion.f90
+                    :caption: ``docs/source/examples/fortran/diffusion.f90``
+                    :language: fortran
+
+                TODO
+
+    .. group-tab:: Generic template
+
+        .. tabs::
+
+            .. group-tab:: Python
+
+                .. literalinclude:: tutorial_code/checkpointing_instance_partial.py
+                    :caption: ``docs/source/tutorial_code/checkpointing_instance_partial.py``
+                    :language: python
+                    :diff: templates/instance.py
+
+            ..
+                group-tab:: C++
+
+                .. literalinclude:: templates/instance.cpp
+                    :caption: ``docs/source/templates/instance.cpp``
+                    :language: c++
+
+                TODO
+
+            ..
+                group-tab:: Fortran
+
+                .. literalinclude:: templates/instance.f90
+                    :caption: ``docs/source/templates/instance.f90``
+                    :language: fortran
+
+                TODO
+
+
+Step 2: implement resume
+````````````````````````
+
+Now that the checkpoint hooks are implemented, we can add support for resuming
+from a previously created checkpoint. When resuming, there are two options:
+resuming from an intermediate checkpoint and resuming from a final checkpoint.
+
+When resuming from an intermediate checkpoint, your component first loads its
+state from the checkpoint. However, we cannot just continue with the ``F_INIT``
+Operator, and instead we need to skip ahead to the point where the checkpoint
+was taken.
+
+When resuming from a final checkpoint, your component first loads its state from
+the checkpoint. Next, your component executes the ``F_INIT`` operator as usual.
+
+Steps to implement the resumption logic:
+
+1.  At the start of -- but inside -- the reuse loop you check if you need to
+    resume from a previous snapshot with the API call ``resuming()``.
+
+    .. note::
+
+        This takes place inside the reuse loop. Currently resuming can only
+        happen during the first iteration of the reuse loop. However, additional
+        checkpointing features are planned that would allow a model to resume
+        multiple times inside one run. By implementing the resume logic inside
+        the reuse loop, your component will be forwards-compatible with this.
+
+2.  When resuming, you load the previously stored snapshot with
+    ``load_snapshot()`` and restore the state of your component.
+3.  Afterwards check if initialization is required with ``should_init()`` and
+    run the regular initialization logic.
+4.  Continue with the time-integration loop.
+
+See :ref:`Example: implemented checkpoint hooks and resume` for example
+implementations in the reaction-diffusion models and the component template.
+
+.. seealso::
+    Python API documentation: :py:meth:`~libmuscle.Instance.resuming`,
+    :py:meth:`~libmuscle.Instance.load_snapshot`,
+    :py:meth:`~libmuscle.Instance.should_init`.
+
+Reload settings when resuming
+'''''''''''''''''''''''''''''
+
+You will notice in the :ref:`examples <Example: implemented checkpoint hooks and
+resume>` that the resume logic is not executed first in the reuse-loop.
+Instead, the components all retrieve settings. The reason behind this is that it
+allows the user to resume a simulation with slightly different settings and have
+those settings take effect immediately after resuming.
+
+It is not required to do this, so you get to decide if (and when) you reload
+settings after resuming. Be sure to include the behaviour of your component in
+the documentation, such that users of your component know what they can expect.
+
+
+Example: implemented checkpoint hooks and resume
+''''''''''''''''''''''''''''''''''''''''''''''''
+
+Note that below examples only shows the changes compared to the :ref:`start
+situation <Start situation: components without checkpointing>`. You can view the
+full contents of the files in the git repository.
+
+
+.. tabs::
+
+    .. group-tab:: Reaction model
+
+        .. rubric:: Resume logic
+
+        In :ref:`Example: implemented checkpoint hooks` we made the choice to
+        store different data in the message for intermediate and final
+        snapshots. When resuming we therefore need to handle these two cases.
+
+        .. tabs::
+
+            .. group-tab:: Python
+
+                .. literalinclude:: examples/python/checkpointing_reaction.py
+                    :caption: ``docs/source/examples/python/checkpointing_reaction.py``
+                    :language: python
+                    :diff: examples/python/reaction.py
+
+            ..
+                group-tab:: C++
+
+                .. literalinclude:: examples/cpp/reaction.cpp
+                    :caption: ``docs/source/examples/cpp/reaction.cpp``
+                    :language: c++
+
+                TODO
+
+            ..
+                group-tab:: Fortran
+
+                .. literalinclude:: examples/fortran/reaction.f90
+                    :caption: ``docs/source/examples/fortran/reaction.f90``
+                    :language: fortran
+
+                TODO
+
+    .. group-tab:: Diffusion model
+
+        .. rubric:: Resume logic
+
+        For the diffusion model we stored the same state for intermediate and
+        final snapshots. This makes resuming easier because we do not have to
+        distinguish between the data stored in the loaded ``Message`` object.
+
+        .. tabs::
+
+            .. group-tab:: Python
+
+                .. literalinclude:: examples/python/checkpointing_diffusion.py
+                    :caption: ``docs/source/examples/python/checkpointing_diffusion.py``
+                    :language: python
+                    :diff: examples/python/diffusion.py
+
+            ..
+                group-tab:: C++
+
+                .. literalinclude:: examples/cpp/diffusion.cpp
+                    :caption: ``docs/source/examples/cpp/diffusion.cpp``
+                    :language: c++
+
+                TODO
+
+            ..
+                group-tab:: Fortran
+
+                .. literalinclude:: examples/fortran/diffusion.f90
+                    :caption: ``docs/source/examples/fortran/diffusion.f90``
+                    :language: fortran
+
+                TODO
+
+    .. group-tab:: Generic template
+
+        .. tabs::
+
+            .. group-tab:: Python
+
+                .. literalinclude:: templates/checkpointing_instance.py
+                    :caption: ``docs/source/templates/checkpointing_instance.py``
+                    :language: python
+                    :diff: templates/instance.py
+
+            ..
+                group-tab:: C++
+
+                .. literalinclude:: templates/instance.cpp
+                    :caption: ``docs/source/templates/instance.cpp``
+                    :language: c++
+
+                TODO
+
+            ..
+                group-tab:: Fortran
+
+                .. literalinclude:: templates/instance.f90
+                    :caption: ``docs/source/templates/instance.f90``
+                    :language: fortran
+
+                TODO
+
+
+Stateless components
+````````````````````
+
+Some components do not need to store data between reuses. An example of that is
+the reaction model from above examples. In the final snapshot, no state needs
+to be stored to allow properly resuming this component, see
+:ref:`Example: implemented checkpoint hooks`. These components are called
+stateless.
+
+.. note::
+
+    *Stateless* in this context means that the component does not store any
+    state between iterations of the reuse loop. Although there is a state inside
+    the loop (e.g. ``cur_t`` and ``U``), the component is still considered
+    *stateless*.
+
+Other examples of stateless components may be data transformers, receiving data
+on an ``F_INIT`` port and sending the converted data on an ``O_F`` port.
+
+If you indicate to libmuscle that these components are *stateless*, libmuscle
+automatically provides checkpointing for your component. See below example for
+a stateless variant of the example reaction model.
+
+.. tabs::
+
+    .. group-tab:: Python
+
+        .. literalinclude:: examples/python/stateless_reaction.py
+            :caption: ``docs/source/examples/python/stateless_reaction.py``
+            :language: python
+            :diff: examples/python/reaction.py
+
+    ..
+        group-tab:: C++
+
+        .. literalinclude:: examples/cpp/reaction.cpp
+            :caption: ``docs/source/examples/cpp/reaction.cpp``
+            :language: c++
+
+        TODO
+
+    ..
+        group-tab:: Fortran
+
+        .. literalinclude:: examples/fortran/reaction.f90
+            :caption: ``docs/source/examples/fortran/reaction.f90``
+            :language: fortran
+
+        TODO
+
+.. seealso::
+    Python API documentation: :py:class:`libmuscle.Instance`.
+
+
+Builtin validation
+``````````````````
+
+MUSCLE3's checkpointing API was carefully designed to allow consistenly resuming
+a simulation. This is only possible when components carefully implement the
+checkpointing API. To support you in this task, MUSCLE3 tries to detect any
+issues with the checkpointing implementation. When MUSCLE3 detects a problem, an
+error is raised to indicate what went wrong and point you in the right direction
+for fixing the problem.
+
+Some of these errors can be turned into a warning by setting the environment
+variable ``MUSCLE_DISABLE_CHECKPOINT_VALIDATION`` to any value. Note that it is
+discouraged to use this: the errors are there to ensure proper exeuction and
+resuming. Not adhering to the rules may lead to deadlocks or crashes of your
+simulations.
+
+..
+    TODO: create an overview of the validation rules
 
 
 Checkpointing deep-dive
 -----------------------
 
-TODO
+This checkpointing deep-dive explains the details of the distributed
+checkpointing implemented in MUSCLE3. Usually you will not need to read or
+understand these details when you want to run simulations with checkpointing
+(see :ref:`User tutorial`) or implement checkpointing in a MUSCLE3 component
+(see :ref:`Developer tutorial`).
+
+.. contents:: Checkpointing deep-dive contents
+    :local:
diff --git a/docs/source/examples/python/checkpointing_diffusion.py b/docs/source/examples/python/checkpointing_diffusion.py
index 067858ed..2a676fa2 100644
--- a/docs/source/examples/python/checkpointing_diffusion.py
+++ b/docs/source/examples/python/checkpointing_diffusion.py
@@ -7,7 +7,7 @@
 from ymmsl import Operator
 
 
-def laplacian(Z: np.ndarray, dx: float) -> np.ndarray:
+def laplacian(Z: np.array, dx: float) -> np.array:
     """Calculates the Laplacian of vector Z.
 
     Args:
@@ -46,8 +46,8 @@ def diffusion() -> None:
 
         if instance.resuming():
             msg = instance.load_snapshot()
-            U = msg.data[0].array.copy()
-            Us = msg.data[1].array.copy()
+            Us = msg.data.array.copy()
+            U = Us[-1]
             t_cur = msg.timestamp
 
         if instance.should_init():
@@ -82,7 +82,7 @@ def diffusion() -> None:
             t_cur += dt
 
             if instance.should_save_snapshot(t_cur):
-                msg = Message(t_cur, None, [Grid(U), Grid(Us)])
+                msg = Message(t_cur, None, Grid(Us))
                 instance.save_snapshot(msg)
 
         # O_F
@@ -109,7 +109,7 @@ def diffusion() -> None:
             plt.show()
 
         if instance.should_save_final_snapshot():
-            msg = Message(t_cur, None, [Grid(U), Grid(Us)])
+            msg = Message(t_cur, None, Grid(Us))
             instance.save_final_snapshot(msg)
 
 
diff --git a/docs/source/examples/python/checkpointing_reaction.py b/docs/source/examples/python/checkpointing_reaction.py
index 07204e64..6e43db30 100644
--- a/docs/source/examples/python/checkpointing_reaction.py
+++ b/docs/source/examples/python/checkpointing_reaction.py
@@ -40,15 +40,15 @@ def reaction() -> None:
             t_cur += dt
 
             if instance.should_save_snapshot(t_cur):
-                instance.save_snapshot(Message(t_cur, None, [
-                        Grid(U, ['x']),
-                        t_stop]))
+                msg = Message(t_cur, None, [Grid(U, ['x']), t_stop])
+                instance.save_snapshot(msg)
 
         # O_F
         instance.send('final_state', Message(t_cur, None, Grid(U, ['x'])))
 
         if instance.should_save_final_snapshot():
-            instance.save_final_snapshot(Message(t_cur, None, None))
+            msg = Message(t_cur, None, None)
+            instance.save_final_snapshot(msg)
 
 
 if __name__ == '__main__':
diff --git a/docs/source/examples/python/reaction.py b/docs/source/examples/python/reaction.py
index aad03ba0..fb0b6f8f 100644
--- a/docs/source/examples/python/reaction.py
+++ b/docs/source/examples/python/reaction.py
@@ -1,5 +1,4 @@
 import logging
-import numpy as np
 
 from libmuscle import Grid, Instance, Message
 from ymmsl import Operator
diff --git a/docs/source/examples/python/stateless_reaction.py b/docs/source/examples/python/stateless_reaction.py
new file mode 100644
index 00000000..9a3f1b1b
--- /dev/null
+++ b/docs/source/examples/python/stateless_reaction.py
@@ -0,0 +1,39 @@
+import logging
+
+from libmuscle import Grid, Instance, Message
+from ymmsl import Operator, ImplementationState
+
+
+def reaction() -> None:
+    """A simple exponential reaction model on a 1D grid.
+    """
+    instance = Instance({
+            Operator.F_INIT: ['initial_state'],     # list of float
+            Operator.O_F: ['final_state']},         # list of float
+            stateful=ImplementationState.STATELESS)
+
+    while instance.reuse_instance():
+        # F_INIT
+        t_max = instance.get_setting('t_max', 'float')
+        dt = instance.get_setting('dt', 'float')
+        k = instance.get_setting('k', 'float')
+
+        msg = instance.receive('initial_state')
+        U = msg.data.array.copy()
+
+        t_cur = msg.timestamp
+        while t_cur + dt < msg.timestamp + t_max:
+            # O_I
+
+            # S
+            U += k * U * dt
+            t_cur += dt
+
+        # O_F
+        instance.send('final_state', Message(t_cur, None, Grid(U, ['x'])))
+
+
+if __name__ == '__main__':
+    logging.basicConfig()
+    logging.getLogger().setLevel(logging.INFO)
+    reaction()
diff --git a/docs/source/templates/checkpointing_instance.py b/docs/source/templates/checkpointing_instance.py
new file mode 100644
index 00000000..6c54390b
--- /dev/null
+++ b/docs/source/templates/checkpointing_instance.py
@@ -0,0 +1,54 @@
+"""MUSCLE3 Python component template.
+
+Note that this template is not executable as is, please have a look at the
+examples in ``docs/source/examples`` to see working components."""
+
+from libmuscle import Instance, Message
+from ymmsl import Operator
+
+instance = Instance({
+        Operator.F_INIT: ["F_INIT_Port"],
+        Operator.O_I: ["O_I_Port"],
+        Operator.S: ["S_Port"],
+        Operator.O_F: ["O_F_Port"]})
+
+while instance.reuse_instance():
+    # F_INIT
+    setting = instance.get_setting("setting")
+    ...
+
+    if instance.resuming():
+        msg = instance.load_snapshot()
+        ...  # restore state from message
+
+    if instance.should_init():
+        instance.receive("F_INIT_Port")
+        ...
+
+    while t_cur <= t_max:
+        # O_I
+        t_next = t + dt
+        if t_next > t_max:
+            t_next = None
+
+        instance.send("O_I_Port", Message(t_cur, t_next, data))
+        ...
+
+        # S
+        instance.receive("S_Port")
+        ...
+
+        t_cur += dt
+
+        if instance.should_save_snapshot(t_cur):
+            state = ...  # component implementation
+            msg = Message(t_cur, None, state)
+            instance.save_snapshot(msg)
+
+    # O_F
+    instance.send("O_F_Port", Message(t_cur, None, data))
+
+    if instance.should_save_final_snapshot():
+        state = ...  # component implementation
+        msg = Message(t_cur, None, state)
+        instance.save_final_snapshot(msg)
diff --git a/docs/source/templates/instance.cpp b/docs/source/templates/instance.cpp
new file mode 100644
index 00000000..4913f9a1
--- /dev/null
+++ b/docs/source/templates/instance.cpp
@@ -0,0 +1,55 @@
+#include <cstdlib>
+#include <vector>
+
+#include <libmuscle/libmuscle.hpp>
+#include <ymmsl/ymmsl.hpp>
+
+
+using libmuscle::Instance;
+using libmuscle::Message;
+using ymmsl::Operator;
+
+
+/**
+ * MUSCLE3 C++ component template.
+ *
+ * Note that this template is not executable as is, please have a look at the
+ * examples in ``docs/source/examples`` to see working components.
+ */
+int main(int argc, char * argv[]) {
+    Instance instance(argc, argv, {
+            {Operator::F_INIT, {"F_INIT_Port"}},
+            {Operator::O_I, {"O_I_Port"}},
+            {Operator::S, {"O_S_Port"}},
+            {Operator::O_F, {"O_F_Port"}}});
+
+    while (instance.reuse_instance()) {
+        // F_INIT
+        auto setting = instance.get_setting("setting");
+        // ...
+
+        instance.receive("F_INIT_Port");
+        // ...
+
+        while (t_cur <= t_max) {
+            // O_I
+            Message msg(t_cur, data);
+            if(t_cur + dt <= t_max)
+                msg.set_next_timestamp(t_cur + dt);
+            instance.send("O_I_Port", msg);
+            // ...
+
+            // S
+            instance.receive("S_Port");
+            // ...
+
+            t_cur += dt;
+        }
+
+        // O_F
+        instance.send("final_state", Message(t_cur, data));
+    }
+
+    return EXIT_SUCCESS;
+}
+
diff --git a/docs/source/templates/instance.f90 b/docs/source/templates/instance.f90
new file mode 100644
index 00000000..93b14db2
--- /dev/null
+++ b/docs/source/templates/instance.f90
@@ -0,0 +1,66 @@
+program instance
+    ! MUSCLE3 Fortran component template.
+    !
+    ! Note that this template is not executable as is, please have a look at the
+    ! examples in ``docs/source/examples`` to see working components.
+
+    use ymmsl
+    use libmuscle
+    implicit none
+
+    type(LIBMUSCLE_PortsDescription) :: ports
+    type(LIBMUSCLE_Instance) :: instance
+
+    type(LIBMUSCLE_Message) :: rmsg, smsg
+    type(LIBMUSCLE_Data) :: data
+
+    real (selected_real_kind(15)) :: t_cur, t_max, dt
+
+
+    ports = LIBMUSCLE_PortsDescription_create()
+    call LIBMUSCLE_PortsDescription_add(ports, YMMSL_Operator_F_INIT, 'F_INIT_Port')
+    call LIBMUSCLE_PortsDescription_add(ports, YMMSL_Operator_O_I, 'O_I_Port')
+    call LIBMUSCLE_PortsDescription_add(ports, YMMSL_Operator_S, 'S_Port')
+    call LIBMUSCLE_PortsDescription_add(ports, YMMSL_Operator_O_F, 'O_F_Port')
+    instance = LIBMUSCLE_Instance_create(ports)
+    call LIBMUSCLE_PortsDescription_free(ports)
+
+    do while (LIBMUSCLE_Instance_reuse_instance(instance))
+        ! F_INIT
+        setting = LIBMUSCLE_Instance_get_setting_as_real8(instance, 'setting')
+        ! ...
+
+        rmsg = LIBMUSCLE_Instance_receive(instance, 'F_INIT')
+        call LIBMUSCLE_Message_free(rmsg)
+        ! ...
+
+        do while (t_cur <= t_max)
+            ! O_I
+            smsg = LIBMUSCLE_Message_create(t_cur, data)
+            if (t_cur + dt <= t_max) then
+                call LIBMUSCLE_Message_set_next_timestamp(smsg, t_cur + dt)
+            end if
+            call LIBMUSCLE_Instance_send(instance, 'O_I', smsg)
+            call LIBMUSCLE_Message_free(smsg)
+            ! ...
+
+            ! S
+            rmsg = LIBMUSCLE_Instance_receive(instance, 'S')
+            call LIBMUSCLE_Message_free(rmsg)
+            ! ...
+
+            t_cur = t_cur + dt
+        end do
+
+        ! O_F
+        smsg = LIBMUSCLE_Message_create(t_cur, data)
+        call LIBMUSCLE_Instance_send(instance, 'final_state', smsg)
+        call LIBMUSCLE_Message_free(smsg)
+        ! ...
+
+    end do
+
+    call LIBMUSCLE_Instance_free(instance)
+
+end program reaction
+
diff --git a/docs/source/templates/instance.py b/docs/source/templates/instance.py
new file mode 100644
index 00000000..0e291dec
--- /dev/null
+++ b/docs/source/templates/instance.py
@@ -0,0 +1,39 @@
+"""MUSCLE3 Python component template.
+
+Note that this template is not executable as is, please have a look at the
+examples in ``docs/source/examples`` to see working components."""
+
+from libmuscle import Instance, Message
+from ymmsl import Operator
+
+instance = Instance({
+        Operator.F_INIT: ["F_INIT_Port"],
+        Operator.O_I: ["O_I_Port"],
+        Operator.S: ["S_Port"],
+        Operator.O_F: ["O_F_Port"]})
+
+while instance.reuse_instance():
+    # F_INIT
+    setting = instance.get_setting("setting")
+    ...
+
+    instance.receive("F_INIT_Port")
+    ...
+
+    while t_cur <= t_max:
+        # O_I
+        t_next = t + dt
+        if t_next > t_max:
+            t_next = None
+
+        instance.send("O_I_Port", Message(t_cur, t_next, data))
+        ...
+
+        # S
+        instance.receive("S_Port")
+        ...
+
+        t_cur += dt
+
+    # O_F
+    instance.send("O_F_Port", Message(t_cur, None, data))
diff --git a/docs/source/tutorial_code/checkpointing_diffusion_partial.py b/docs/source/tutorial_code/checkpointing_diffusion_partial.py
new file mode 100644
index 00000000..9b622375
--- /dev/null
+++ b/docs/source/tutorial_code/checkpointing_diffusion_partial.py
@@ -0,0 +1,112 @@
+import logging
+import os
+
+import numpy as np
+
+from libmuscle import Grid, Instance, Message
+from ymmsl import Operator
+
+
+def laplacian(Z: np.array, dx: float) -> np.array:
+    """Calculates the Laplacian of vector Z.
+
+    Args:
+        Z: A vector representing a series of samples along a line.
+        dx: The spacing between the samples.
+
+    Returns:
+        The second spatial derivative of Z.
+    """
+    Zleft = Z[:-2]
+    Zright = Z[2:]
+    Zcenter = Z[1:-1]
+    return (Zleft + Zright - 2. * Zcenter) / dx**2
+
+
+def diffusion() -> None:
+    """A simple diffusion model on a 1d grid.
+
+    The state of this model is a 1D grid of concentrations. It sends
+    out the state on each timestep on `state_out`, and can receive an
+    updated state on `state_in` at each state update.
+    """
+    logger = logging.getLogger()
+    instance = Instance({
+            Operator.O_I: ['state_out'],
+            Operator.S: ['state_in'],
+            Operator.O_F: ['final_state_out']})
+
+    while instance.reuse_instance():
+        # F_INIT
+        t_max = instance.get_setting('t_max', 'float')
+        dt = instance.get_setting('dt', 'float')
+        x_max = instance.get_setting('x_max', 'float')
+        dx = instance.get_setting('dx', 'float')
+        d = instance.get_setting('d', 'float')
+
+        U = np.zeros(int(round(x_max / dx))) + 1e-20
+        U[25] = 2.0
+        U[50] = 2.0
+        U[75] = 2.0
+        Us = U
+
+        t_cur = 0.0
+        while t_cur + dt <= t_max:
+            # O_I
+            t_next = t_cur + dt
+            if t_next + dt > t_max:
+                t_next = None
+            cur_state_msg = Message(t_cur, t_next, Grid(U, ['x']))
+            instance.send('state_out', cur_state_msg)
+
+            # S
+            msg = instance.receive('state_in', default=cur_state_msg)
+            if msg.timestamp > t_cur + dt:
+                logger.warning('Received a message from the future!')
+            np.copyto(U, msg.data.array)
+
+            dU = np.zeros_like(U)
+            dU[1:-1] = d * laplacian(U, dx) * dt
+            dU[0] = dU[1]
+            dU[-1] = dU[-2]
+
+            U += dU
+            Us = np.vstack((Us, U))
+            t_cur += dt
+
+            if instance.should_save_snapshot(t_cur):
+                msg = Message(t_cur, None, Grid(Us))
+                instance.save_snapshot(msg)
+
+        # O_F
+        final_state_msg = Message(t_cur, None, Grid(U, ['x']))
+        instance.send('final_state_out', final_state_msg)
+
+        if 'DONTPLOT' not in os.environ and 'SLURM_NODENAME' not in os.environ:
+            from matplotlib import pyplot as plt
+            plt.figure()
+            plt.imshow(
+                    np.log(Us + 1e-20),
+                    origin='upper',
+                    extent=[
+                        -0.5*dx, x_max - 0.5*dx,
+                        (t_max - 0.5*dt) * 1000.0, -0.5*dt * 1000.0],
+                    interpolation='none',
+                    aspect='auto'
+                    )
+            cbar = plt.colorbar()
+            cbar.set_label('log(Concentration)', rotation=270, labelpad=20)
+            plt.xlabel('x')
+            plt.ylabel('t (ms)')
+            plt.title('Concentration over time')
+            plt.show()
+
+        if instance.should_save_final_snapshot():
+            msg = Message(t_cur, None, Grid(Us))
+            instance.save_final_snapshot(msg)
+
+
+if __name__ == '__main__':
+    logging.basicConfig()
+    logging.getLogger().setLevel(logging.INFO)
+    diffusion()
diff --git a/docs/source/tutorial_code/checkpointing_instance_partial.py b/docs/source/tutorial_code/checkpointing_instance_partial.py
new file mode 100644
index 00000000..31c9d550
--- /dev/null
+++ b/docs/source/tutorial_code/checkpointing_instance_partial.py
@@ -0,0 +1,49 @@
+"""MUSCLE3 Python component template.
+
+Note that this template is not executable as is, please have a look at the
+examples in ``docs/source/examples`` to see working components."""
+
+from libmuscle import Instance, Message
+from ymmsl import Operator
+
+instance = Instance({
+        Operator.F_INIT: ["F_INIT_Port"],
+        Operator.O_I: ["O_I_Port"],
+        Operator.S: ["S_Port"],
+        Operator.O_F: ["O_F_Port"]})
+
+while instance.reuse_instance():
+    # F_INIT
+    setting = instance.get_setting("setting")
+    ...
+
+    instance.receive("F_INIT_Port")
+    ...
+
+    while t_cur <= t_max:
+        # O_I
+        t_next = t + dt
+        if t_next > t_max:
+            t_next = None
+
+        instance.send("O_I_Port", Message(t_cur, t_next, data))
+        ...
+
+        # S
+        instance.receive("S_Port")
+        ...
+
+        t_cur += dt
+
+        if instance.should_save_snapshot(t_cur):
+            state = ...  # component implementation
+            msg = Message(t_cur, None, state)
+            instance.save_snapshot(msg)
+
+    # O_F
+    instance.send("O_F_Port", Message(t_cur, None, data))
+
+    if instance.should_save_final_snapshot():
+        state = ...  # component implementation
+        msg = Message(t_cur, None, state)
+        instance.save_final_snapshot(msg)
diff --git a/docs/source/tutorial_code/checkpointing_reaction_partial.py b/docs/source/tutorial_code/checkpointing_reaction_partial.py
new file mode 100644
index 00000000..efe4f629
--- /dev/null
+++ b/docs/source/tutorial_code/checkpointing_reaction_partial.py
@@ -0,0 +1,47 @@
+import logging
+
+from libmuscle import Grid, Instance, Message
+from ymmsl import Operator
+
+
+def reaction() -> None:
+    """A simple exponential reaction model on a 1D grid.
+    """
+    instance = Instance({
+            Operator.F_INIT: ['initial_state'],     # list of float
+            Operator.O_F: ['final_state']})         # list of float
+
+    while instance.reuse_instance():
+        # F_INIT
+        t_max = instance.get_setting('t_max', 'float')
+        dt = instance.get_setting('dt', 'float')
+        k = instance.get_setting('k', 'float')
+
+        msg = instance.receive('initial_state')
+        U = msg.data.array.copy()
+
+        t_cur = msg.timestamp
+        t_stop = msg.timestamp + t_max
+        while t_cur + dt < t_stop:
+            # O_I
+
+            # S
+            U += k * U * dt
+            t_cur += dt
+
+            if instance.should_save_snapshot(t_cur):
+                msg = Message(t_cur, None, [Grid(U, ['x']), t_stop])
+                instance.save_snapshot(msg)
+
+        # O_F
+        instance.send('final_state', Message(t_cur, None, Grid(U, ['x'])))
+
+        if instance.should_save_final_snapshot():
+            msg = Message(t_cur, None, None)
+            instance.save_final_snapshot(msg)
+
+
+if __name__ == '__main__':
+    logging.basicConfig()
+    logging.getLogger().setLevel(logging.INFO)
+    reaction()

From ea74160d0055af70443c5d46c0dd68a980a7dfa6 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Fri, 9 Dec 2022 15:14:56 +0100
Subject: [PATCH 146/183] Add first draft for checkpointing deep-dive docs

---
 docs/source/checkpointing.rst | 524 ++++++++++++++++++++++++++++++++++
 1 file changed, 524 insertions(+)

diff --git a/docs/source/checkpointing.rst b/docs/source/checkpointing.rst
index 51ce7869..355d404d 100644
--- a/docs/source/checkpointing.rst
+++ b/docs/source/checkpointing.rst
@@ -1113,3 +1113,527 @@ understand these details when you want to run simulations with checkpointing
 
 .. contents:: Checkpointing deep-dive contents
     :local:
+
+
+Consistency for simulation time checkpoints
+```````````````````````````````````````````
+
+In this section we take a look at the three allowed coupling types in the MMSF:
+call/release, interact and dispatch coupling. In the following sections
+we will analyze consistency for each of the coupling types.
+
+The underlying assumption is: if we can take consistent snapshots for each pair
+of coupled components, we can take consistent snapshots of the whole workflow.
+
+Call/release coupling
+'''''''''''''''''''''
+
+In this section we will look at the call/release coupling mode. The first
+example simulation consists of two components: Component 1 and Component 2. They
+are coupled as follows:
+
+-   The ``O_I`` port of Component 1 is connected to the ``F_INIT`` port of
+    Component 2.
+-   The ``O_F`` port of Component 2 is connected to the ``S`` port of
+    Component 1.
+
+.. code-block:: text
+    :caption: Example run for three iterations of Component 1.
+
+    Component 1:  |Fi|Oi|........ S |Oi|........ S |Oi|........ S |Of|
+                        \        /     \        /     \        /
+    Component 2:        |Fi|S |Of|..... Fi|S |Of|..... Fi|S |Of|
+
+The above schema shows the Operator (``F_INIT``, ``O_I``, ``S``, ``Of``) that
+each comonent is in during the run. The dots (``...``) indicate a blocking
+call: in this case it is the ``receive`` during the ``S`` operator of Component
+1, and the ``receive`` of the ``F_INIT`` operator of Component 2.
+
+Let's add the simulation time for each component on the example timeline.
+
+-   During ``F_INIT``, the internal time is initialized. Component 1 initializes
+    to a constant ``t0``. Component 2 initializes the time to the timestamp
+    received in the message.
+-   During ``S`` the state is updated and the simulation time may move forward.
+
+.. code-block:: text
+    :caption: Example run, also showing simulation times in the components.
+
+      time        |t0            |t2            |t4            |t6
+    Component 1:  |Fi|Oi|........ S |Oi|........ S |Oi|........ S |Of|
+                        \        /     \        /     \        /
+    Component 2:        |Fi|S |Of|..... Fi|S |Of|..... Fi|S |Of|
+      time              |t0|t1         |t2|t3         |t4|t5
+
+We assume that each component only moves forward in time, so
+:math:`t_0 \le t_2 \le t_4 \le t_6` and :math:`t_0 \le t_1`, :math:`t_2 \le t_3`
+and :math:`t_4 \le t_5`. The time evolution of Component 2 should be smaller
+than the time step of Component 1 in this coupling type. Therefore:
+:math:`t_1 \le t_2`, :math:`t_3 \le t_4` and :math:`t_5 \le t_6`.
+
+.. rubric:: Introducing checkpoints
+
+Component 1 can take checkpoints immediately after the ``S`` operator. Component
+2 can only take checkpoints after the ``O_F`` operator. Let's investigate what
+needs to happen when a checkpoint :math:`t_c` is requested for different values
+of :math:`t_c`:
+
+1.  :math:`t_c \leq t_0`
+2.  :math:`t_0 < t_c \leq t_1`
+3.  :math:`t_1 < t_c \leq t_2`
+4.  :math:`t_2 < t_c  \leq t_4`
+
+Note: a checkpoint :math:`t_4 < t_c  \leq t_6` would behave the same as scenario
+4, just at a later point in the simulation, so we won't work out later
+checkpoints in detail.
+
+.. tabs::
+
+    .. tab:: :math:`t_c \leq t_0`
+
+        Both components will take a snapshot at the earliest possible moment,
+        indicated with a ``C`` block in the timelines below.
+
+        You may notice that the ``C`` block in Component 2 is blocking. Although
+        the internal time of Component 2 already exceeded the checkpoint time,
+        :ref:`Final snapshots` actually determine if a snapshot should be
+        taken based on the message(s) arriving during the next ``F_INIT``.
+
+        .. rubric:: Consistency
+
+        Both snapshots have the same message counts: 1 message sent/received per
+        conduit. When resuming, Component 1 starts by sending a new message on
+        its ``O_I`` port, and Component 2 runs ``F_INIT`` as usual.
+
+        .. code-block:: text
+
+              time        |t0            |t2                  |t4            |t6
+            Component 1:  |Fi|Oi|........ S |C |Oi|........... S |Oi|........ S |Of|
+                                \        /        \           /     \        /
+            Component 2:        |Fi|S |Of|........ C |Fi|S |Of|..... Fi|S |Of|
+              time              |t0|t1               |t2|t3         |t4|t5
+
+    .. tab:: :math:`t_0 < t_c \leq t_1` and :math:`t_1 < t_c \leq t_2`
+
+        For both checkpoint times, a snapshot will be taken at the earliest
+        possible moment.
+
+        After the first ``S`` operator, Component 1 is at :math:`t=t_2` which is
+        after the checkpoint time, so it takes a snapshot. After the first reuse
+        loop, Component 2 receives a message with :math:`t=t_2` which is after
+        the checkpoint time, so it will take a snapshot at the end of the first
+        reuse loop.
+
+        .. rubric:: Consistency
+
+        Both snapshots have the same message counts: 1 message sent/received per
+        conduit. When resuming, Component 1 starts by sending a new message on
+        its ``O_I`` port, and Component 2 runs ``F_INIT`` as usual.
+
+        .. code-block:: text
+
+              time        |t0            |t2                  |t4            |t6
+            Component 1:  |Fi|Oi|........ S |C |Oi|........... S |Oi|........ S |Of|
+                                \        /        \           /     \        /
+            Component 2:        |Fi|S |Of|........ C |Fi|S |Of|..... Fi|S |Of|
+              time              |t0|t1               |t2|t3         |t4|t5
+
+    .. tab:: :math:`t_2 < t_c  \leq t_4`
+
+        Both components will take a snapshot at the earliest possible moment,
+        indicated with a ``C`` block in the timelines below.
+
+        After the first ``S`` operator, Component 1 is at :math:`t=t_2` which is
+        before the checkpoint time. After the second ``S`` operator it has
+        passed the checkpoint time, so it takes a snapshot. This works similarly
+        for Component 2.
+
+        .. rubric:: Consistency
+
+        Both snapshots have the same message counts: 2 messages sent/received
+        per conduit. When resuming, Component 1 starts by sending a new message
+        on its ``O_I`` port, and Component 2 runs ``F_INIT`` as usual.
+
+        .. code-block:: text
+
+              time        |t0            |t2            |t4                  |t6
+            Component 1:  |Fi|Oi|........ S |Oi|........ S |C |Oi|........... S |Of|
+                                \        /     \        /        \           /
+            Component 2:        |Fi|S |Of|..... Fi|S |Of|........ C |Fi|S |Of|
+              time              |t0|t1         |t2|t3               |t4|t5
+
+
+.. rubric:: Micro component with time integration and intermediate snapshots
+
+Let's see what happens when we replace Component 2 by Component 3, which does
+time integration and implements intermediate snapshots.
+
+
+.. code-block:: text
+    :caption: Example run, also showing simulation times in the components.
+
+      time        |t0                  |t4                  |t8
+    Component 1:  |Fi|Oi|.............. S |Oi|.............. S |Oi|........
+                        \              /     \              /     \
+    Component 3:        |Fi|S |S |S |Of|..... Fi|S |S |S |Of|..... Fi|S |S
+      time              |t0|t1|t2|t3         |t4|t5|t6|t7         |t8|t9|t10
+
+For the same reasons as with Component 2, :math:`t_i \leq t_{i+1}` for
+:math:`i=0,1,...`.
+
+Now, Component 3 can make intermediate snapshots between each ``S``, but also
+final snapshots. Let's see what effect that has for different checkpoint times:
+
+.. tabs::
+
+    .. tab:: :math:`t_c \leq t_1`
+
+        In this case, both components will take a snapshot at the first possible
+        moment: right after their first ``S`` block.
+
+        .. rubric:: Consistency
+
+        Now the snapshots have different message counts. For the ``O_I ->
+        F_INIT`` conduit both components see 1 message sent/received. For the
+        other conduit, however, Component 1 already received a message that is
+        not sent in Component 3's snapshot.
+
+        When resuming, Component 3 resumes in its state update loop and sends a
+        message back to Component 1 during ``O_F``. This message is discarded by
+        Component 1. From that point, the simulation can resume as usual.
+
+        .. code-block:: text
+
+              time        |t0                     |t4                     |t8
+            Component 1:  |Fi|Oi|................. S |C |Oi|.............. S |Oi|........
+                                \                 /        \              /     \
+            Component 3:        |Fi|S |C |S |S |Of|........ Fi|S |S |S |Of|..... Fi|S |S
+              time              |t0|t1   |t2|t3            |t4|t5|t6|t7         |t8|t9|t10
+
+    .. tab:: :math:`t_1 < t_c \leq t_2`
+
+        This is quite similar to the previous case. The difference is that
+        Component 3 takes its snapshot after the second ``S`` block.
+
+        .. code-block:: text
+
+              time        |t0                     |t4                     |t8
+            Component 1:  |Fi|Oi|................. S |C |Oi|.............. S |Oi|........
+                                \                 /        \              /     \
+            Component 3:        |Fi|S |S |C |S |Of|........ Fi|S |S |S |Of|..... Fi|S |S
+              time              |t0|t1|t2   |t3            |t4|t5|t6|t7         |t8|t9|t10
+
+    .. tab:: :math:`t_3 < t_c \leq t_4`
+
+        The checkpoint for Component 1 does not change. However, in this case
+        Component 3 takes a :ref:`final snapshot <Final snapshots>` instead of
+        an :ref:`intermediate snapshot <Intermediate snapshots>`.
+
+        .. rubric:: Consistency
+
+        Both snapshots have the same message counts: 1 message sent/received
+        per conduit. When resuming, Component 1 starts by sending a new message
+        on its ``O_I`` port, and Component 2 runs ``F_INIT`` as usual.
+
+        .. code-block:: text
+
+              time        |t0                  |t4                        |t8
+            Component 1:  |Fi|Oi|.............. S |C |Oi|................. S |Oi|........
+                                \              /        \                 /     \
+            Component 3:        |Fi|S |S |S |Of|........ C |Fi|S |S |S |Of|..... Fi|S |S
+              time              |t0|t1|t2|t3               |t4|t5|t6|t7         |t8|t9|t10
+
+
+Interact coupling
+'''''''''''''''''
+
+In this section we will look at the interact coupling mode. This example
+simulation consists of two components: Component 1 and Component 2. They are
+coupled as follows:
+
+-   The ``O_I`` port of Component 1 is connected to the ``S`` port of
+    Component 2.
+-   The ``O_I`` port of Component 2 is connected to the ``S`` port of
+    Component 1.
+
+.. code-block:: text
+    :caption: Example lock-step interact run for three iterations.
+
+      time        |t0   |t1   |t2   |t3
+    Component 1:  |Fi|Oi|S |Oi|S |Oi|S |Of|
+                        X     X     X
+    Component 2:  |Fi|Oi|S |Oi|S |Oi|S |Of|
+      time        |t0   |t1   |t2   |t3
+
+Let's see what happens for different checkpoint times:
+
+.. tabs::
+
+    .. tab:: :math:`t_c \leq t_1`
+
+        In this case, both components make a snapshot After the first ``S``
+        block.
+
+        .. rubric:: Consistency
+
+        Both snapshots have the same message counts: 1 message sent/received
+        per conduit. When resuming, both components send the next message at
+        ``O_I`` and continue with their ``S``.
+
+        .. code-block:: text
+
+                  time        |t0   |t1      |t2   |t3
+                Component 1:  |Fi|Oi|S |C |Oi|S |Oi|S |Of|
+                                    X        X     X
+                Component 2:  |Fi|Oi|S |C |Oi|S |Oi|S |Of|
+                  time        |t0   |t1      |t2   |t3
+
+    .. tab:: :math:`t_1 < t_c \leq t_2`
+
+        This is almost the same as on the previous tab, just at a later point in
+        the run.
+
+        .. code-block:: text
+
+                  time        |t0   |t1   |t2      |t3
+                Component 1:  |Fi|Oi|S |Oi|S |C |Oi|S |Of|
+                                    X     X        X
+                Component 2:  |Fi|Oi|S |Oi|S |C |Oi|S |Of|
+                  time        |t0   |t1   |t2      |t3
+
+If the two components do not use the same time step, a scale bridge is required
+to interpolate. See ``docs/source/examples/python/interact_coupling.py`` for an
+implementation of such a component. The timeline becomes a bit more complicated
+now:
+
+.. code-block:: text
+    :caption: Example interact run. Component 1 has a smaller time step than Component 2.
+
+      time        |t0            |t1                     |t2         |t4
+    Component 1:  |Fi|Oi|........ S |Oi|................. S |Oi|..... S |Oi|...........
+                        \        /     \                 /     \     /     \
+    Scale bridge:       |S |S |Oi|..... S |Oi|..... S |Oi|..... S |Oi|..... S |Oi|.....
+                           /                 \     /                             \
+    Component 2:     |Fi|Oi|................. S |Oi|............................. S |Oi
+      time           |t0                     |t3                                 |t5
+
+Let's see what happens for different checkpoint times:
+
+.. tabs::
+
+    .. tab:: :math:`t_c \leq t_0`
+
+        In this case, both components make a snapshot after the first ``S``
+        block. The scale bridge creates a snapshot after the first two ``S`` are
+        complete.
+
+        .. rubric:: Consistency
+
+        Both component snapshots have received one more message on ``S`` than
+        the scale bridge has sent. This is no problem: when resuming, the scale
+        bridge will send the messages again, but those are discarded by both
+        components.
+
+        .. code-block:: text
+
+              time        |t0               |t1                           |t2         |t4
+            Component 1:  |Fi|Oi|........... S |C |Oi|.................... S |Oi|..... S |Oi|...........
+                                \           /        \                    /     \     /     \
+            Scale bridge:       |S |S |C |Oi|........ S |Oi|........ S |Oi|..... S |Oi|..... S |Oi|.....
+                                /                          \        /                             \
+            Component 2:     |Fi|Oi|....................... S |C |Oi|............................. S |Oi
+              time           |t0                           |t3                                    |t5
+
+    .. tab:: :math:`t_0 < t_c \leq t_1`
+
+        In this case, both components make a snapshot after the first ``S``
+        block. The scale bridge creates a snapshot after receiving the second
+        message from Component 2.
+
+        .. rubric:: Consistency
+
+        In this case, the scale bridge has received one more message on its
+        ``S`` port at its checkpoint moment, than the components have sent at
+        their checkpoints. Again, this is no problem: the components send their
+        messages again when resuming, but these are discarded by the scale
+        bridge.
+
+        .. code-block:: text
+
+              time        |t0            |t1                              |t2         |t4
+            Component 1:  |Fi|Oi|........ S |C |Oi|....................... S |Oi|..... S |Oi|...........
+                                \        /        \                       /     \     /     \
+            Scale bridge:       |S |S |Oi|........ S |Oi|........ S |C |Oi|..... S |Oi|..... S |Oi|.....
+                                /                       \        /                                \
+            Component 2:     |Fi|Oi|.................... S |C |Oi|................................ S |Oi
+              time           |t0                        |t3                                       |t5
+
+    .. tab:: :math:`t_1 < t_c \leq t_2`
+
+        Now component 1 takes a snapshot after its second ``S`` phase. Component
+        still takes a snapshot after its first ``S`` phase. The scale bridge
+        checkpoints after receiving the third message from Component 1.
+
+        .. rubric:: Consistency
+
+        Again, the scale bridge has received one more message on its
+        ``S`` port at its checkpoint moment, than the components have sent at
+        their checkpoints. Again, this is no problem: the components send their
+        messages again when resuming, but these are discarded by the scale
+        bridge.
+
+        .. code-block:: text
+
+              time        |t0            |t1                        |t2               |t4
+            Component 1:  |Fi|Oi|........ S |Oi|.................... S |C |Oi|......... S |Oi|...........
+                                \        /     \                    /        \        /     \
+            Scale bridge:       |S |S |Oi|..... S |Oi|........ S |Oi|........ S |C |Oi|..... S |Oi|.....
+                                /                    \        /                                   \
+            Component 2:     |Fi|Oi|................. S |C |Oi|................................... S |Oi
+              time           |t0                     |t3                                          |t5
+
+
+Dispatch coupling
+'''''''''''''''''
+
+Finally, we take a look at two component coupled in dispatch:
+
+-   The ``O_F`` port of Component 1 is connected to the ``F_INIT`` port of
+    Component 2.
+
+This leads to the following timeline:
+
+.. code-block:: text
+    :caption: Example lock-step interact run for three iterations.
+
+      time        |t0|t1|t2|t3
+    Component 1:  |Fi|S |S |S |Of|
+                                 \
+    Component 2:                 |Fi|S |S |S |Of|
+      time                       |t3|t4|t5|t6
+
+.. tabs::
+
+    .. tab:: :math:`t_c \leq t_1`
+
+        In this case, both components make a snapshot after the first ``S``
+        block.
+
+        .. rubric:: Consistency
+
+        The snapshot of Component 1 can be combined with the snapshot of
+        Component 2, but then all remaining work of Component 1 will be ignored
+        by Component 2. It is also possible to restart Component 2 from scratch
+        (this is also consistent).
+
+        .. code-block:: text
+
+              time        |t0|t1   |t2|t3
+            Component 1:  |Fi|S |C |S |S |Of|
+                                            \
+            Component 2:                    |Fi|S |C |S |S |Of|
+              time                          |t3|t4|   t5|t6
+
+    .. tab:: :math:`t_1 < t_c \leq t_2`
+
+        This is similar to the previous tab. However, Component 1 takes a
+        snapshot at a later point.
+
+
+    .. tab:: :math:`t_3 < t_c \leq t_4`
+
+        In this case, Component 1 does not take a snapshot, unless either:
+
+        1.  A :ref:`checkpoint rule is defined <Defining checkpoints>` for
+            ``at_end``, or
+        2.  Component 1 is executed again (for example, when this is a
+            sub-workflow in a call/release coupling) and a final snapshot is
+            triggered.
+
+        .. rubric:: Consistency
+
+        When a final snapshot is taken by Component 1, it will be consistent
+        with any checkpoint taken during the exeuction of Component 2 and we can
+        restart the workflow.
+
+        .. code-block:: text
+
+              time        |t0|t1|t2|t3
+            Component 1:  |Fi|S |S |S |Of|C?
+                                         \
+            Component 2:                 |Fi|S |C |S |S |Of|
+              time                       |t3|t4|   t5|t6
+
+
+(In)consistency for wallclock time checkpoints
+``````````````````````````````````````````````
+
+In the current implementation, wallclock time checkpoints are taken as soon as
+possible after exceeding a certain wallclock time. Let's look at an example
+where this is not leading to consistent workflow snapshots.
+
+This example is similar to the :ref:`Interact coupling` example seen previously.
+
+-   The ``O_I`` port of Component 1 is connected to the ``S`` port of
+    Component 2.
+-   The ``O_I`` port of Component 2 is connected to the ``S`` port of
+    Component 1.
+
+However, let's now look at the wallclock time and assume that Component 1's
+``S`` Operator takes longer than Component 2's, compute time indicated by
+``~~``:
+
+.. code-block:: text
+
+    Wallclock time:         |w1|w2    |w3|w4
+    Component 1:  |Fi|Oi|.S ~~~|Oi|.S ~~~|Oi|.S ~~~|Of|
+                        \/      __\/      __\/
+                        /\     /   \     /   \
+    Component 2:  |Fi|Oi|.S |Oi|... S |Oi|... S |Of|
+
+Because Component 1 spends more time in ``S``, Component 2 is waiting in each
+following iteration of ``S``. Let's see what happens for different wallclock
+time checkpoint moments :math:`w_c`:
+
+.. tabs::
+
+    .. tab:: :math:`w_c \leq w_1`
+
+        In this case, both components make a snapshot after the first ``S``
+        block.
+
+        .. rubric:: Consistency
+
+        At the moment of snapshot, both components have the same number of
+        messages sent/received on their conduits. This is consistent.
+
+        .. code-block:: text
+
+            Wallclock time:         |w1|w2       |w3|w4
+            Component 1:  |Fi|Oi|.S ~~~|C |Oi|.S ~~~|Oi|.S ~~~|Of|
+                                \/         __\/      __\/
+                                /\        /   \     /   \
+            Component 2:  |Fi|Oi|.S |C |Oi|... S |Oi|... S |Of|
+
+    .. tab:: :math:`w_1 < w_c \leq w_2`
+
+        Component 1 takes a snapshot after the first ``S`` block, but Component
+        2 after its second ``S`` block.
+
+        .. rubric:: Consistency
+
+        The created snapshots are not consistent: Component 2 has sent 1 more
+        message than Component 1 has received. When resuming Component 1 would
+        wait for a message that never comes again, so this is not a valid resume
+        point.
+
+        .. code-block:: text
+
+            Wallclock time:         |w1|w2       |w3|w4
+            Component 1:  |Fi|Oi|.S ~~~|C |Oi|.S ~~~|Oi|.S ~~~|Of|
+                                \/      _____\/        \/
+                                /\     /      \        /\
+            Component 2:  |Fi|Oi|.S |Oi|...... S |C |Oi|. S |Of|
+
+As you can see, the second scenario does not lead to consistent checkpoints.
+

From c77b72d978ba2e6020c06a3f62c6d0aa9162883a Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Fri, 9 Dec 2022 15:21:15 +0100
Subject: [PATCH 147/183] Pin tox version in RTD config

---
 .readthedocs.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.readthedocs.yml b/.readthedocs.yml
index 2a63fe5e..163dfbaa 100644
--- a/.readthedocs.yml
+++ b/.readthedocs.yml
@@ -8,7 +8,7 @@ build:
     python: "3.8"
   jobs:
     post_create_environment:
-      - pip install tox
+      - pip install tox<4
       # let tox set up a build environment
       - tox -e docs --notest
       # export tox installed packages as requirements for the install step

From cd2abf82fe7d36ededcbb8c444106c0ed47095ea Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Fri, 9 Dec 2022 15:22:11 +0100
Subject: [PATCH 148/183] Proper quoting in RTD config

---
 .readthedocs.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.readthedocs.yml b/.readthedocs.yml
index 163dfbaa..d3cdc812 100644
--- a/.readthedocs.yml
+++ b/.readthedocs.yml
@@ -8,7 +8,7 @@ build:
     python: "3.8"
   jobs:
     post_create_environment:
-      - pip install tox<4
+      - pip install 'tox<4'
       # let tox set up a build environment
       - tox -e docs --notest
       # export tox installed packages as requirements for the install step

From 9cd6fbbe288cf595277fa975472f4803ff815515 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Fri, 9 Dec 2022 15:23:54 +0100
Subject: [PATCH 149/183] Quotes don't work, using backslash in RTD config

---
 .readthedocs.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.readthedocs.yml b/.readthedocs.yml
index d3cdc812..084ce69a 100644
--- a/.readthedocs.yml
+++ b/.readthedocs.yml
@@ -8,7 +8,7 @@ build:
     python: "3.8"
   jobs:
     post_create_environment:
-      - pip install 'tox<4'
+      - pip install tox\<4
       # let tox set up a build environment
       - tox -e docs --notest
       # export tox installed packages as requirements for the install step

From daac74503e5c117c47ec5b6215011e5354904c59 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Fri, 13 Jan 2023 13:46:59 +0100
Subject: [PATCH 150/183] Update checkpointing documentation

---
 docs/source/checkpointing.rst                 | 103 ++++++++++++------
 .../python/checkpointing_diffusion.py         |  10 +-
 .../examples/python/checkpointing_reaction.py |  11 +-
 ...n.py => reaction_no_state_for_next_use.py} |   6 +-
 .../checkpointing_diffusion_partial.py        |  10 +-
 .../checkpointing_instance_partial.py         |  10 +-
 .../checkpointing_reaction_partial.py         |  11 +-
 7 files changed, 102 insertions(+), 59 deletions(-)
 rename docs/source/examples/python/{stateless_reaction.py => reaction_no_state_for_next_use.py} (86%)

diff --git a/docs/source/checkpointing.rst b/docs/source/checkpointing.rst
index 355d404d..ce6a1a0c 100644
--- a/docs/source/checkpointing.rst
+++ b/docs/source/checkpointing.rst
@@ -3,7 +3,7 @@ Simulation checkpoints
 
 When you execute a long-running simulation, it can be very helpful to store the
 state of a simulation at certain intervals. For example, your simulation running
-on a HPC cluster may crash, just before it's finished, due to insufficient
+on a HPC cluster may crash due to insufficient
 memory available. Instead of restarting this simulation from scratch, you could
 restart it -- with an increased memory allocation -- from a checkpoint, which
 would save a lot of compute time!
@@ -13,6 +13,14 @@ comes with built-in checkpointing support. This page describes in detail how to
 use the MUSCLE3 checkpointing API, how to specify checkpoints in the workflow
 configuration and how to resume a workflow.
 
+.. warning::
+
+    Checkpointing in MUSCLE3 version 0.6.0 is still in development: the API may
+    change in a future MUSCLE3 release.
+
+    Checkpointing is only available in the Python API. C++ and Fortran support
+    is planned for version 0.7.0.
+
 In the :ref:`user tutorial`, you can read about the checkpointing concepts and
 how to use the API when running and resuming MUSCLE3 simulations. This is
 followed by a :ref:`developer tutorial`, which explains how to add checkpointing
@@ -272,14 +280,20 @@ initialization of ``muscle_manager``.
     in the simulation will create a snapshot when requested, there is no
     guarantee that all snapshots are :ref:`consistent <Snapshot consistency>`.
 
-    When a simulation has relatively simple coupling between components, i.e.
-    only one peer instance per :external:py:class:`~ymmsl.Operator`,
+    When a simulation has relatively simple coupling between components,
     checkpointing based on wallclock time usually works fine.
 
     However for co-simulation (the *interact* coupling type) and more complex
     coupling, it is likely that not all checkpoints lead to a consistent
     :term:`workflow snapshot`.
 
+    If you intend to use wallclock time checkpoints and find that you often
+    don't get a consistent workflow snapshot, you may try the following
+    workaround: instead of requesting a wallclock time checkpoint at (for
+    example) 600 seconds, you can specify checkpoints at 600, 601, 602, 603, 604
+    and 605 seconds. The "right" interval to use will depend on the typical
+    compute times of your components and coupling in the simulation.
+
 
 Running a simulation with checkpoints
 `````````````````````````````````````
@@ -443,7 +457,6 @@ Resuming from *at_end* snapshots
 
 .. warning::
     Resuming from only ``at_end`` snapshot will immediately complete.
-    TODO: Need to think about this still.
 
 
 Snapshot consistency
@@ -636,7 +649,46 @@ not complete and cannot be executed.
                     :language: fortran
 
 
-Step 1: implement checkpoint hooks
+Step 1: Set ``USES_CHECKPOINT_API`` on instance creation
+````````````````````````````````````````````````````````
+
+As first step, you need to indicate that you intend to use the checkpoint API.
+You do this through the :attr:`~InstanceFlags.USES_CHECKPOINT_API` flag when
+creating the instance:
+
+.. tabs::
+
+    .. group-tab:: Python
+
+        .. code-block:: Python
+
+            from libmuscle import Instance, USES_CHECKPOINT_API
+
+            ...
+
+            ports = ...
+            instance = Instance(ports, USES_CHECKPOINT_API)
+
+    ..
+        group-tab:: C++
+
+        .. code-block:: C++
+
+            TODO
+
+    ..
+        group-tab:: Fortran
+
+        .. code-block:: Fortran
+
+            TODO
+
+
+If you do not set this flag, you'll get a runtime error when trying to use any
+of the checkpointing API calls on the Instance object.
+
+
+Step 2: Implement checkpoint hooks
 ``````````````````````````````````
 
 The first step in implementing the checkpointing API is implementing the
@@ -862,7 +914,7 @@ full contents of the files in the git repository.
                 TODO
 
 
-Step 2: implement resume
+Step 3: Implement resume
 ````````````````````````
 
 Now that the checkpoint hooks are implemented, we can add support for resuming
@@ -904,6 +956,7 @@ implementations in the reaction-diffusion models and the component template.
     :py:meth:`~libmuscle.Instance.load_snapshot`,
     :py:meth:`~libmuscle.Instance.should_init`.
 
+
 Reload settings when resuming
 '''''''''''''''''''''''''''''
 
@@ -919,7 +972,7 @@ the documentation, such that users of your component know what they can expect.
 
 
 Example: implemented checkpoint hooks and resume
-''''''''''''''''''''''''''''''''''''''''''''''''
+````````````````````````````````````````````````
 
 Note that below examples only shows the changes compared to the :ref:`start
 situation <Start situation: components without checkpointing>`. You can view the
@@ -1028,35 +1081,29 @@ full contents of the files in the git repository.
                 TODO
 
 
-Stateless components
-````````````````````
+Components that do not keep state between reuse
+```````````````````````````````````````````````
 
-Some components do not need to store data between reuses. An example of that is
+Some components do not need to keep state between reuses. An example of that is
 the reaction model from above examples. In the final snapshot, no state needs
 to be stored to allow properly resuming this component, see
-:ref:`Example: implemented checkpoint hooks`. These components are called
-stateless.
+:ref:`Example: implemented checkpoint hooks`.
 
-.. note::
-
-    *Stateless* in this context means that the component does not store any
-    state between iterations of the reuse loop. Although there is a state inside
-    the loop (e.g. ``cur_t`` and ``U``), the component is still considered
-    *stateless*.
-
-Other examples of stateless components may be data transformers, receiving data
+Other examples of such components may be data transformers, receiving data
 on an ``F_INIT`` port and sending the converted data on an ``O_F`` port.
 
-If you indicate to libmuscle that these components are *stateless*, libmuscle
-automatically provides checkpointing for your component. See below example for
-a stateless variant of the example reaction model.
+If you indicate to libmuscle that your component does not keep state between
+reuse, libmuscle automatically provides checkpointing for your component. You do
+this by providing the :attr:`~InstanceFlags.KEEPS_NO_STATE_FOR_NEXT_USE` flag
+when creating the instance. See below example for a variant of the example
+reaction model.
 
 .. tabs::
 
     .. group-tab:: Python
 
-        .. literalinclude:: examples/python/stateless_reaction.py
-            :caption: ``docs/source/examples/python/stateless_reaction.py``
+        .. literalinclude:: examples/python/reaction_no_state_for_next_use.py
+            :caption: ``docs/source/examples/python/reaction_no_state_for_next_use.py``
             :language: python
             :diff: examples/python/reaction.py
 
@@ -1092,12 +1139,6 @@ issues with the checkpointing implementation. When MUSCLE3 detects a problem, an
 error is raised to indicate what went wrong and point you in the right direction
 for fixing the problem.
 
-Some of these errors can be turned into a warning by setting the environment
-variable ``MUSCLE_DISABLE_CHECKPOINT_VALIDATION`` to any value. Note that it is
-discouraged to use this: the errors are there to ensure proper exeuction and
-resuming. Not adhering to the rules may lead to deadlocks or crashes of your
-simulations.
-
 ..
     TODO: create an overview of the validation rules
 
diff --git a/docs/source/examples/python/checkpointing_diffusion.py b/docs/source/examples/python/checkpointing_diffusion.py
index 2a676fa2..9595c289 100644
--- a/docs/source/examples/python/checkpointing_diffusion.py
+++ b/docs/source/examples/python/checkpointing_diffusion.py
@@ -3,7 +3,7 @@
 
 import numpy as np
 
-from libmuscle import Grid, Instance, Message
+from libmuscle import Grid, Instance, Message, USES_CHECKPOINT_API
 from ymmsl import Operator
 
 
@@ -34,7 +34,7 @@ def diffusion() -> None:
     instance = Instance({
             Operator.O_I: ['state_out'],
             Operator.S: ['state_in'],
-            Operator.O_F: ['final_state_out']})
+            Operator.O_F: ['final_state_out']}, USES_CHECKPOINT_API)
 
     while instance.reuse_instance():
         # F_INIT
@@ -82,11 +82,11 @@ def diffusion() -> None:
             t_cur += dt
 
             if instance.should_save_snapshot(t_cur):
-                msg = Message(t_cur, None, Grid(Us))
+                msg = Message(t_cur, data=Grid(Us))
                 instance.save_snapshot(msg)
 
         # O_F
-        final_state_msg = Message(t_cur, None, Grid(U, ['x']))
+        final_state_msg = Message(t_cur, data=Grid(U, ['x']))
         instance.send('final_state_out', final_state_msg)
 
         if 'DONTPLOT' not in os.environ and 'SLURM_NODENAME' not in os.environ:
@@ -109,7 +109,7 @@ def diffusion() -> None:
             plt.show()
 
         if instance.should_save_final_snapshot():
-            msg = Message(t_cur, None, Grid(Us))
+            msg = Message(t_cur, data=Grid(Us))
             instance.save_final_snapshot(msg)
 
 
diff --git a/docs/source/examples/python/checkpointing_reaction.py b/docs/source/examples/python/checkpointing_reaction.py
index 6e43db30..fbfb1d74 100644
--- a/docs/source/examples/python/checkpointing_reaction.py
+++ b/docs/source/examples/python/checkpointing_reaction.py
@@ -1,6 +1,6 @@
 import logging
 
-from libmuscle import Grid, Instance, Message
+from libmuscle import Grid, Instance, Message, USES_CHECKPOINT_API
 from ymmsl import Operator
 
 
@@ -9,7 +9,8 @@ def reaction() -> None:
     """
     instance = Instance({
             Operator.F_INIT: ['initial_state'],     # list of float
-            Operator.O_F: ['final_state']})         # list of float
+            Operator.O_F: ['final_state']},         # list of float
+            USES_CHECKPOINT_API)
 
     while instance.reuse_instance():
         t_max = instance.get_setting('t_max', 'float')
@@ -40,14 +41,14 @@ def reaction() -> None:
             t_cur += dt
 
             if instance.should_save_snapshot(t_cur):
-                msg = Message(t_cur, None, [Grid(U, ['x']), t_stop])
+                msg = Message(t_cur, data=[Grid(U, ['x']), t_stop])
                 instance.save_snapshot(msg)
 
         # O_F
-        instance.send('final_state', Message(t_cur, None, Grid(U, ['x'])))
+        instance.send('final_state', Message(t_cur, data=Grid(U, ['x'])))
 
         if instance.should_save_final_snapshot():
-            msg = Message(t_cur, None, None)
+            msg = Message(t_cur)
             instance.save_final_snapshot(msg)
 
 
diff --git a/docs/source/examples/python/stateless_reaction.py b/docs/source/examples/python/reaction_no_state_for_next_use.py
similarity index 86%
rename from docs/source/examples/python/stateless_reaction.py
rename to docs/source/examples/python/reaction_no_state_for_next_use.py
index 9a3f1b1b..e0cb4dca 100644
--- a/docs/source/examples/python/stateless_reaction.py
+++ b/docs/source/examples/python/reaction_no_state_for_next_use.py
@@ -1,7 +1,7 @@
 import logging
 
-from libmuscle import Grid, Instance, Message
-from ymmsl import Operator, ImplementationState
+from libmuscle import Grid, Instance, Message, KEEPS_NO_STATE_FOR_NEXT_USE
+from ymmsl import Operator
 
 
 def reaction() -> None:
@@ -10,7 +10,7 @@ def reaction() -> None:
     instance = Instance({
             Operator.F_INIT: ['initial_state'],     # list of float
             Operator.O_F: ['final_state']},         # list of float
-            stateful=ImplementationState.STATELESS)
+            KEEPS_NO_STATE_FOR_NEXT_USE)
 
     while instance.reuse_instance():
         # F_INIT
diff --git a/docs/source/tutorial_code/checkpointing_diffusion_partial.py b/docs/source/tutorial_code/checkpointing_diffusion_partial.py
index 9b622375..3e588b42 100644
--- a/docs/source/tutorial_code/checkpointing_diffusion_partial.py
+++ b/docs/source/tutorial_code/checkpointing_diffusion_partial.py
@@ -3,7 +3,7 @@
 
 import numpy as np
 
-from libmuscle import Grid, Instance, Message
+from libmuscle import Grid, Instance, Message, USES_CHECKPOINT_API
 from ymmsl import Operator
 
 
@@ -34,7 +34,7 @@ def diffusion() -> None:
     instance = Instance({
             Operator.O_I: ['state_out'],
             Operator.S: ['state_in'],
-            Operator.O_F: ['final_state_out']})
+            Operator.O_F: ['final_state_out']}, USES_CHECKPOINT_API)
 
     while instance.reuse_instance():
         # F_INIT
@@ -75,11 +75,11 @@ def diffusion() -> None:
             t_cur += dt
 
             if instance.should_save_snapshot(t_cur):
-                msg = Message(t_cur, None, Grid(Us))
+                msg = Message(t_cur, data=Grid(Us))
                 instance.save_snapshot(msg)
 
         # O_F
-        final_state_msg = Message(t_cur, None, Grid(U, ['x']))
+        final_state_msg = Message(t_cur, data=Grid(U, ['x']))
         instance.send('final_state_out', final_state_msg)
 
         if 'DONTPLOT' not in os.environ and 'SLURM_NODENAME' not in os.environ:
@@ -102,7 +102,7 @@ def diffusion() -> None:
             plt.show()
 
         if instance.should_save_final_snapshot():
-            msg = Message(t_cur, None, Grid(Us))
+            msg = Message(t_cur, data=Grid(Us))
             instance.save_final_snapshot(msg)
 
 
diff --git a/docs/source/tutorial_code/checkpointing_instance_partial.py b/docs/source/tutorial_code/checkpointing_instance_partial.py
index 31c9d550..2aefb080 100644
--- a/docs/source/tutorial_code/checkpointing_instance_partial.py
+++ b/docs/source/tutorial_code/checkpointing_instance_partial.py
@@ -3,14 +3,14 @@
 Note that this template is not executable as is, please have a look at the
 examples in ``docs/source/examples`` to see working components."""
 
-from libmuscle import Instance, Message
+from libmuscle import Instance, Message, USES_CHECKPOINT_API
 from ymmsl import Operator
 
 instance = Instance({
         Operator.F_INIT: ["F_INIT_Port"],
         Operator.O_I: ["O_I_Port"],
         Operator.S: ["S_Port"],
-        Operator.O_F: ["O_F_Port"]})
+        Operator.O_F: ["O_F_Port"]}, USES_CHECKPOINT_API)
 
 while instance.reuse_instance():
     # F_INIT
@@ -37,13 +37,13 @@
 
         if instance.should_save_snapshot(t_cur):
             state = ...  # component implementation
-            msg = Message(t_cur, None, state)
+            msg = Message(t_cur, data=state)
             instance.save_snapshot(msg)
 
     # O_F
-    instance.send("O_F_Port", Message(t_cur, None, data))
+    instance.send("O_F_Port", Message(t_cur, data=data))
 
     if instance.should_save_final_snapshot():
         state = ...  # component implementation
-        msg = Message(t_cur, None, state)
+        msg = Message(t_cur, data=state)
         instance.save_final_snapshot(msg)
diff --git a/docs/source/tutorial_code/checkpointing_reaction_partial.py b/docs/source/tutorial_code/checkpointing_reaction_partial.py
index efe4f629..49c706d5 100644
--- a/docs/source/tutorial_code/checkpointing_reaction_partial.py
+++ b/docs/source/tutorial_code/checkpointing_reaction_partial.py
@@ -1,6 +1,6 @@
 import logging
 
-from libmuscle import Grid, Instance, Message
+from libmuscle import Grid, Instance, Message, USES_CHECKPOINT_API
 from ymmsl import Operator
 
 
@@ -9,7 +9,8 @@ def reaction() -> None:
     """
     instance = Instance({
             Operator.F_INIT: ['initial_state'],     # list of float
-            Operator.O_F: ['final_state']})         # list of float
+            Operator.O_F: ['final_state']},         # list of float
+            USES_CHECKPOINT_API)
 
     while instance.reuse_instance():
         # F_INIT
@@ -30,14 +31,14 @@ def reaction() -> None:
             t_cur += dt
 
             if instance.should_save_snapshot(t_cur):
-                msg = Message(t_cur, None, [Grid(U, ['x']), t_stop])
+                msg = Message(t_cur, data=[Grid(U, ['x']), t_stop])
                 instance.save_snapshot(msg)
 
         # O_F
-        instance.send('final_state', Message(t_cur, None, Grid(U, ['x'])))
+        instance.send('final_state', Message(t_cur, data=Grid(U, ['x'])))
 
         if instance.should_save_final_snapshot():
-            msg = Message(t_cur, None, None)
+            msg = Message(t_cur)
             instance.save_final_snapshot(msg)
 
 

From b31fccdf0a30ecd55df76ef30c0f2d1483bc604d Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Fri, 13 Jan 2023 14:13:44 +0100
Subject: [PATCH 151/183] Add warning when using checkpointing API

---
 libmuscle/python/libmuscle/instance.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py
index a1e13ec4..fae7386a 100644
--- a/libmuscle/python/libmuscle/instance.py
+++ b/libmuscle/python/libmuscle/instance.py
@@ -108,6 +108,11 @@ def __init__(
         self.__is_shut_down = False
 
         self._flags = InstanceFlags(flags)
+        if InstanceFlags.USES_CHECKPOINT_API in self._flags:
+            warnings.warn(
+                    'Checkpointing in MUSCLE3 version 0.6.0 is still in'
+                    ' development: the API may change in a future MUSCLE3'
+                    ' release.')
 
         # Note that these are accessed by Muscle3, but otherwise private.
         self._name, self._index = self.__make_full_name()

From 1eb7580f4a615e818aad6c9633c2098d9e7a1d54 Mon Sep 17 00:00:00 2001
From: Maarten Sebregts <msebregts@ignitioncomputing.com>
Date: Fri, 13 Jan 2023 14:41:24 +0100
Subject: [PATCH 152/183] Error when checkpoints requested but not supported

---
 libmuscle/python/libmuscle/instance.py        | 13 +++++++++++
 .../python/libmuscle/test/test_instance.py    | 23 ++++++++++++++++++-
 2 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py
index fae7386a..0a906a39 100644
--- a/libmuscle/python/libmuscle/instance.py
+++ b/libmuscle/python/libmuscle/instance.py
@@ -87,6 +87,12 @@ class InstanceFlags(Flag):
     """
 
 
+_CHECKPOINT_SUPPORT_MASK = (
+        InstanceFlags.USES_CHECKPOINT_API |
+        InstanceFlags.KEEPS_NO_STATE_FOR_NEXT_USE |
+        InstanceFlags.STATE_NOT_REQUIRED_FOR_NEXT_USE)
+
+
 class Instance:
     """Represents a component instance in a MUSCLE3 simulation.
 
@@ -178,6 +184,13 @@ def __init__(
         elapsed_time, checkpoints = checkpoint_info[0:2]
         self._trigger_manager.set_checkpoint_info(elapsed_time, checkpoints)
 
+        if checkpoints and not (self._flags & _CHECKPOINT_SUPPORT_MASK):
+            raise RuntimeError(
+                    'The workflow has requested checkpoints, but this instance'
+                    ' does not support checkpointing. Please consult the'
+                    ' MUSCLE3 checkpointing documentation how to add'
+                    ' checkpointing support.')
+
         resume_snapshot, snapshot_dir = checkpoint_info[2:4]
         saved_at = self._snapshot_manager.prepare_resume(
                 resume_snapshot, snapshot_dir)
diff --git a/libmuscle/python/libmuscle/test/test_instance.py b/libmuscle/python/libmuscle/test/test_instance.py
index 7c8b1be4..53a3b302 100644
--- a/libmuscle/python/libmuscle/test/test_instance.py
+++ b/libmuscle/python/libmuscle/test/test_instance.py
@@ -1,3 +1,4 @@
+from contextlib import nullcontext as does_not_raise
 import sys
 from typing import Generator
 from unittest.mock import MagicMock, patch
@@ -6,7 +7,7 @@
 from ymmsl import Operator, Reference, Settings, Checkpoints
 
 from libmuscle.communicator import Message
-from libmuscle.instance import Instance
+from libmuscle.instance import Instance, InstanceFlags as IFlags
 from libmuscle.mpp_message import ClosePort
 from libmuscle.settings_manager import SettingsManager
 
@@ -320,3 +321,23 @@ def test_reuse_instance_no_f_init_ports(instance):
 def test_reuse_instance_miswired(instance):
     with pytest.raises(RuntimeError):
         instance.reuse_instance()
+
+
+@pytest.mark.parametrize('flags, expectation', [
+        (IFlags(0), pytest.raises(RuntimeError)),
+        (IFlags.USES_CHECKPOINT_API, does_not_raise()),
+        (IFlags.KEEPS_NO_STATE_FOR_NEXT_USE, does_not_raise()),
+        (IFlags.STATE_NOT_REQUIRED_FOR_NEXT_USE, does_not_raise())])
+def test_checkpoint_support(sys_argv_instance, tmp_path, flags, expectation):
+    with patch('libmuscle.instance.MMPClient') as mmp_client, \
+         patch('libmuscle.instance.Communicator') as comm_type:
+        comm_type.return_value = MagicMock()
+
+        mmp_client_object = MagicMock()
+        mmp_client_object.request_peers.return_value = (None, None, None)
+        checkpoint_info = (0.0, Checkpoints(at_end=True), None, tmp_path)
+        mmp_client_object.get_checkpoint_info.return_value = checkpoint_info
+        mmp_client.return_value = mmp_client_object
+
+        with expectation:
+            Instance(flags=flags)

From db2186530da02d5ebf5d96b3f2ee582f6899ff07 Mon Sep 17 00:00:00 2001
From: Lourens Veen <l.veen@esciencecenter.nl>
Date: Mon, 16 Jan 2023 20:11:27 +0100
Subject: [PATCH 153/183] Fix small language issues in checkpointing docs

---
 docs/source/checkpointing.rst | 54 ++++++++++++++++++-----------------
 1 file changed, 28 insertions(+), 26 deletions(-)

diff --git a/docs/source/checkpointing.rst b/docs/source/checkpointing.rst
index ce6a1a0c..a019e0a1 100644
--- a/docs/source/checkpointing.rst
+++ b/docs/source/checkpointing.rst
@@ -63,7 +63,7 @@ and resuming simulations. Some details are deliberately left out, though you
 can read all about those in the :ref:`developer tutorial` or :ref:`checkpointing
 deep-dive`.
 
-.. contents:: User totorial contents
+.. contents:: User tutorial contents
     :local:
 
 
@@ -106,7 +106,7 @@ rules to set checkpoint moments for these:
 
 .. _at checkpoint rule:
 
-#. ``at`` rules define specific moments. The example rule above request a
+#. ``at`` rules select specific moments. The example rule above request a
    checkpoint to be taken at 300, 600 and 1800 seconds after the start of the
    simulation. You can define multiple times in one ``at`` rule, but you may
    also add multiple ``at`` rules. The following definitions are all equivalent:
@@ -235,7 +235,7 @@ Simulation time checkpoints
 '''''''''''''''''''''''''''
 
 Checkpoints defined in the ``simulation_time`` section are taken based on the
-time inside your simulation. It will only work correctly if all components in
+time inside your simulation. This will only work correctly if all components in
 the simulation have a shared concept of time, which only increases during the
 simulation. This should be no problem for physics-based simulations, though it
 does require that the instances make correct use of the :ref:`timestamp in
@@ -319,11 +319,11 @@ yMMSL file that can be used to :ref:`resume the simulation <Resuming a
 simulation>`.
 
 During the simulation, all of the created snapshots are stored on the file
-system. See below table for the directories where MUSCLE3 stores the files.
+system. See the table below for the directories where MUSCLE3 stores the files.
 Note: a run-directory is automatically created when using the ``--start-all``
 flag for ``muscle_manager``. You may also specify a custom run directory through
 the ``--run-dir DIRECTORY`` option. When you do not provide a run directory, the
-last column in below table indicates where snapshots are stored.
+last column in the table below indicates where snapshots are stored.
 
 .. list-table:: Directories where MUSCLE3 stores snapshot files.
     :header-rows: 1
@@ -371,7 +371,7 @@ repository. Then execute the following command:
 
         $ make test_examples
 
-Above command runs the ``muscle_manager`` and starts all components (the
+The above command runs the ``muscle_manager`` and starts all components (the
 reaction model and the diffusion model). The ``rd_checkpoints.ymmsl`` file
 contains the checkpoint definitions used in this example:
 
@@ -456,7 +456,7 @@ Resuming from *at_end* snapshots
 ''''''''''''''''''''''''''''''''
 
 .. warning::
-    Resuming from only ``at_end`` snapshot will immediately complete.
+    Resuming from an ``at_end`` snapshot only will immediately complete.
 
 
 Snapshot consistency
@@ -466,16 +466,16 @@ MUSCLE3 checkpointing was designed for consistency: no messages between the
 components must be lost when restarting. When we fulfill this criterium, a
 simulation can resume from a checkpoint as if it was never interrupted.
 
-During a simulation run, each component creates snapshots independent from all
-other components. For :ref:`simulation time checkpoints`, the MUSCLE3
+During a simulation run, each component creates snapshots independently from
+all other components. For :ref:`simulation time checkpoints`, the MUSCLE3
 checkpointing algorithm is guaranteed to give consistent :term:`workflow
 snapshots <workflow snapshot>` when all components adhere to the
 :ref:`Multiscale Modeling and Simulation Framework (MMSF) <citation needed>`.
 
 :ref:`Wallclock time checkpoints` in the currrent implementation are less
 reliable: components may take snapshots while messages are still in transit.
-When that happens, it would lead to an inconsistent state and no workflow
-snapshots would be written by ``muscle_manager``.
+When that happens an inconsistent state is produced and no workflow snapshots
+are written by ``muscle_manager``.
 
 MUSCLE3 does not support combining inconsistent snapshots, so it is not possible
 to freely mix snapshots produced during a simulation. When resuming, MUSCLE3
@@ -510,10 +510,10 @@ General troubleshooting strategy:
     this usually happens when a peer component has crashed and it is typically
     not the root cause of your simulation crash.
 
-Once you find the root cause of your problem, check below list for common issues
-and their resolutions. You may also have found a bug in MUSCLE3: please help us
-and your fellow MUSCLE3 users by :ref:`creating an issue <Make an issue>` on
-github.
+Once you find the root cause of your problem, check the list below for common
+issues and their resolutions. You may also have found a bug in MUSCLE3: please
+help us and your fellow MUSCLE3 users by :ref:`creating an issue <Make an
+issue>` on GitHub.
 
 1. The simulation crashes when using checkpoints.
     The first thing you should check is: does the simulation run error-free when
@@ -548,9 +548,10 @@ github.
 
             $ muscle_manager --run-dir run2 run1/configuration.ymmsl run1/snapshots/snapshot_xyz.ymmsl
 
-    -   One of your components has a bug with resuming from a previous snapshot,
-        or perhaps your snapshot belonged to a different version of the
-        component. Please ask your component developer(s) for help.
+    -   One of your components has a bug that is triggered when resuming from a
+        previous snapshot, or perhaps your snapshot belonged to a different
+        version of the component. Please ask your component developer(s) for
+        help.
 
 
 
@@ -769,7 +770,7 @@ Final snapshots
 '''''''''''''''
 
 Final snapshots **must** be implemented by all components supporting
-checkpointing. You implement taking final snapshot as follows:
+checkpointing. You implement taking a final snapshot as follows:
 
 1.  You must implement the checkpoint calls at the end of the :ref:`reuse loop
     <The reuse loop>`.
@@ -922,12 +923,13 @@ from a previously created checkpoint. When resuming, there are two options:
 resuming from an intermediate checkpoint and resuming from a final checkpoint.
 
 When resuming from an intermediate checkpoint, your component first loads its
-state from the checkpoint. However, we cannot just continue with the ``F_INIT``
-Operator, and instead we need to skip ahead to the point where the checkpoint
-was taken.
+state from the checkpoint. Then it should continue where it left off, which is
+at the beginning of ``O_I``. This means that it has to skip ``F_INIT`` in
+order to run as if it had never stopped.
 
 When resuming from a final checkpoint, your component first loads its state from
-the checkpoint. Next, your component executes the ``F_INIT`` operator as usual.
+the checkpoint. Next, your component executes the ``F_INIT`` operator as usual,
+as it would have had it continued after writing the snapshot.
 
 Steps to implement the resumption logic:
 
@@ -1085,8 +1087,8 @@ Components that do not keep state between reuse
 ```````````````````````````````````````````````
 
 Some components do not need to keep state between reuses. An example of that is
-the reaction model from above examples. In the final snapshot, no state needs
-to be stored to allow properly resuming this component, see
+the reaction model from the above examples. In the final snapshot, no state
+needs to be stored to allow properly resuming this component, see
 :ref:`Example: implemented checkpoint hooks`.
 
 Other examples of such components may be data transformers, receiving data
@@ -1095,7 +1097,7 @@ on an ``F_INIT`` port and sending the converted data on an ``O_F`` port.
 If you indicate to libmuscle that your component does not keep state between
 reuse, libmuscle automatically provides checkpointing for your component. You do
 this by providing the :attr:`~InstanceFlags.KEEPS_NO_STATE_FOR_NEXT_USE` flag
-when creating the instance. See below example for a variant of the example
+when creating the instance. See the below example for a variant of the example
 reaction model.
 
 .. tabs::

From c8a5574c349431a90d68384d793c6ea141ee91bc Mon Sep 17 00:00:00 2001
From: Lourens Veen <l.veen@esciencecenter.nl>
Date: Mon, 16 Jan 2023 20:54:05 +0100
Subject: [PATCH 154/183] Move checkpointing deep dive to developer docs

---
 docs/source/checkpointing.rst           | 537 ------------------------
 docs/source/checkpointing_deep_dive.rst | 535 +++++++++++++++++++++++
 docs/source/for_developers.rst          |  12 +
 docs/source/index.rst                   |   5 +-
 4 files changed, 548 insertions(+), 541 deletions(-)
 create mode 100644 docs/source/checkpointing_deep_dive.rst
 create mode 100644 docs/source/for_developers.rst

diff --git a/docs/source/checkpointing.rst b/docs/source/checkpointing.rst
index a019e0a1..3e1546ba 100644
--- a/docs/source/checkpointing.rst
+++ b/docs/source/checkpointing.rst
@@ -1143,540 +1143,3 @@ for fixing the problem.
 
 ..
     TODO: create an overview of the validation rules
-
-
-Checkpointing deep-dive
------------------------
-
-This checkpointing deep-dive explains the details of the distributed
-checkpointing implemented in MUSCLE3. Usually you will not need to read or
-understand these details when you want to run simulations with checkpointing
-(see :ref:`User tutorial`) or implement checkpointing in a MUSCLE3 component
-(see :ref:`Developer tutorial`).
-
-.. contents:: Checkpointing deep-dive contents
-    :local:
-
-
-Consistency for simulation time checkpoints
-```````````````````````````````````````````
-
-In this section we take a look at the three allowed coupling types in the MMSF:
-call/release, interact and dispatch coupling. In the following sections
-we will analyze consistency for each of the coupling types.
-
-The underlying assumption is: if we can take consistent snapshots for each pair
-of coupled components, we can take consistent snapshots of the whole workflow.
-
-Call/release coupling
-'''''''''''''''''''''
-
-In this section we will look at the call/release coupling mode. The first
-example simulation consists of two components: Component 1 and Component 2. They
-are coupled as follows:
-
--   The ``O_I`` port of Component 1 is connected to the ``F_INIT`` port of
-    Component 2.
--   The ``O_F`` port of Component 2 is connected to the ``S`` port of
-    Component 1.
-
-.. code-block:: text
-    :caption: Example run for three iterations of Component 1.
-
-    Component 1:  |Fi|Oi|........ S |Oi|........ S |Oi|........ S |Of|
-                        \        /     \        /     \        /
-    Component 2:        |Fi|S |Of|..... Fi|S |Of|..... Fi|S |Of|
-
-The above schema shows the Operator (``F_INIT``, ``O_I``, ``S``, ``Of``) that
-each comonent is in during the run. The dots (``...``) indicate a blocking
-call: in this case it is the ``receive`` during the ``S`` operator of Component
-1, and the ``receive`` of the ``F_INIT`` operator of Component 2.
-
-Let's add the simulation time for each component on the example timeline.
-
--   During ``F_INIT``, the internal time is initialized. Component 1 initializes
-    to a constant ``t0``. Component 2 initializes the time to the timestamp
-    received in the message.
--   During ``S`` the state is updated and the simulation time may move forward.
-
-.. code-block:: text
-    :caption: Example run, also showing simulation times in the components.
-
-      time        |t0            |t2            |t4            |t6
-    Component 1:  |Fi|Oi|........ S |Oi|........ S |Oi|........ S |Of|
-                        \        /     \        /     \        /
-    Component 2:        |Fi|S |Of|..... Fi|S |Of|..... Fi|S |Of|
-      time              |t0|t1         |t2|t3         |t4|t5
-
-We assume that each component only moves forward in time, so
-:math:`t_0 \le t_2 \le t_4 \le t_6` and :math:`t_0 \le t_1`, :math:`t_2 \le t_3`
-and :math:`t_4 \le t_5`. The time evolution of Component 2 should be smaller
-than the time step of Component 1 in this coupling type. Therefore:
-:math:`t_1 \le t_2`, :math:`t_3 \le t_4` and :math:`t_5 \le t_6`.
-
-.. rubric:: Introducing checkpoints
-
-Component 1 can take checkpoints immediately after the ``S`` operator. Component
-2 can only take checkpoints after the ``O_F`` operator. Let's investigate what
-needs to happen when a checkpoint :math:`t_c` is requested for different values
-of :math:`t_c`:
-
-1.  :math:`t_c \leq t_0`
-2.  :math:`t_0 < t_c \leq t_1`
-3.  :math:`t_1 < t_c \leq t_2`
-4.  :math:`t_2 < t_c  \leq t_4`
-
-Note: a checkpoint :math:`t_4 < t_c  \leq t_6` would behave the same as scenario
-4, just at a later point in the simulation, so we won't work out later
-checkpoints in detail.
-
-.. tabs::
-
-    .. tab:: :math:`t_c \leq t_0`
-
-        Both components will take a snapshot at the earliest possible moment,
-        indicated with a ``C`` block in the timelines below.
-
-        You may notice that the ``C`` block in Component 2 is blocking. Although
-        the internal time of Component 2 already exceeded the checkpoint time,
-        :ref:`Final snapshots` actually determine if a snapshot should be
-        taken based on the message(s) arriving during the next ``F_INIT``.
-
-        .. rubric:: Consistency
-
-        Both snapshots have the same message counts: 1 message sent/received per
-        conduit. When resuming, Component 1 starts by sending a new message on
-        its ``O_I`` port, and Component 2 runs ``F_INIT`` as usual.
-
-        .. code-block:: text
-
-              time        |t0            |t2                  |t4            |t6
-            Component 1:  |Fi|Oi|........ S |C |Oi|........... S |Oi|........ S |Of|
-                                \        /        \           /     \        /
-            Component 2:        |Fi|S |Of|........ C |Fi|S |Of|..... Fi|S |Of|
-              time              |t0|t1               |t2|t3         |t4|t5
-
-    .. tab:: :math:`t_0 < t_c \leq t_1` and :math:`t_1 < t_c \leq t_2`
-
-        For both checkpoint times, a snapshot will be taken at the earliest
-        possible moment.
-
-        After the first ``S`` operator, Component 1 is at :math:`t=t_2` which is
-        after the checkpoint time, so it takes a snapshot. After the first reuse
-        loop, Component 2 receives a message with :math:`t=t_2` which is after
-        the checkpoint time, so it will take a snapshot at the end of the first
-        reuse loop.
-
-        .. rubric:: Consistency
-
-        Both snapshots have the same message counts: 1 message sent/received per
-        conduit. When resuming, Component 1 starts by sending a new message on
-        its ``O_I`` port, and Component 2 runs ``F_INIT`` as usual.
-
-        .. code-block:: text
-
-              time        |t0            |t2                  |t4            |t6
-            Component 1:  |Fi|Oi|........ S |C |Oi|........... S |Oi|........ S |Of|
-                                \        /        \           /     \        /
-            Component 2:        |Fi|S |Of|........ C |Fi|S |Of|..... Fi|S |Of|
-              time              |t0|t1               |t2|t3         |t4|t5
-
-    .. tab:: :math:`t_2 < t_c  \leq t_4`
-
-        Both components will take a snapshot at the earliest possible moment,
-        indicated with a ``C`` block in the timelines below.
-
-        After the first ``S`` operator, Component 1 is at :math:`t=t_2` which is
-        before the checkpoint time. After the second ``S`` operator it has
-        passed the checkpoint time, so it takes a snapshot. This works similarly
-        for Component 2.
-
-        .. rubric:: Consistency
-
-        Both snapshots have the same message counts: 2 messages sent/received
-        per conduit. When resuming, Component 1 starts by sending a new message
-        on its ``O_I`` port, and Component 2 runs ``F_INIT`` as usual.
-
-        .. code-block:: text
-
-              time        |t0            |t2            |t4                  |t6
-            Component 1:  |Fi|Oi|........ S |Oi|........ S |C |Oi|........... S |Of|
-                                \        /     \        /        \           /
-            Component 2:        |Fi|S |Of|..... Fi|S |Of|........ C |Fi|S |Of|
-              time              |t0|t1         |t2|t3               |t4|t5
-
-
-.. rubric:: Micro component with time integration and intermediate snapshots
-
-Let's see what happens when we replace Component 2 by Component 3, which does
-time integration and implements intermediate snapshots.
-
-
-.. code-block:: text
-    :caption: Example run, also showing simulation times in the components.
-
-      time        |t0                  |t4                  |t8
-    Component 1:  |Fi|Oi|.............. S |Oi|.............. S |Oi|........
-                        \              /     \              /     \
-    Component 3:        |Fi|S |S |S |Of|..... Fi|S |S |S |Of|..... Fi|S |S
-      time              |t0|t1|t2|t3         |t4|t5|t6|t7         |t8|t9|t10
-
-For the same reasons as with Component 2, :math:`t_i \leq t_{i+1}` for
-:math:`i=0,1,...`.
-
-Now, Component 3 can make intermediate snapshots between each ``S``, but also
-final snapshots. Let's see what effect that has for different checkpoint times:
-
-.. tabs::
-
-    .. tab:: :math:`t_c \leq t_1`
-
-        In this case, both components will take a snapshot at the first possible
-        moment: right after their first ``S`` block.
-
-        .. rubric:: Consistency
-
-        Now the snapshots have different message counts. For the ``O_I ->
-        F_INIT`` conduit both components see 1 message sent/received. For the
-        other conduit, however, Component 1 already received a message that is
-        not sent in Component 3's snapshot.
-
-        When resuming, Component 3 resumes in its state update loop and sends a
-        message back to Component 1 during ``O_F``. This message is discarded by
-        Component 1. From that point, the simulation can resume as usual.
-
-        .. code-block:: text
-
-              time        |t0                     |t4                     |t8
-            Component 1:  |Fi|Oi|................. S |C |Oi|.............. S |Oi|........
-                                \                 /        \              /     \
-            Component 3:        |Fi|S |C |S |S |Of|........ Fi|S |S |S |Of|..... Fi|S |S
-              time              |t0|t1   |t2|t3            |t4|t5|t6|t7         |t8|t9|t10
-
-    .. tab:: :math:`t_1 < t_c \leq t_2`
-
-        This is quite similar to the previous case. The difference is that
-        Component 3 takes its snapshot after the second ``S`` block.
-
-        .. code-block:: text
-
-              time        |t0                     |t4                     |t8
-            Component 1:  |Fi|Oi|................. S |C |Oi|.............. S |Oi|........
-                                \                 /        \              /     \
-            Component 3:        |Fi|S |S |C |S |Of|........ Fi|S |S |S |Of|..... Fi|S |S
-              time              |t0|t1|t2   |t3            |t4|t5|t6|t7         |t8|t9|t10
-
-    .. tab:: :math:`t_3 < t_c \leq t_4`
-
-        The checkpoint for Component 1 does not change. However, in this case
-        Component 3 takes a :ref:`final snapshot <Final snapshots>` instead of
-        an :ref:`intermediate snapshot <Intermediate snapshots>`.
-
-        .. rubric:: Consistency
-
-        Both snapshots have the same message counts: 1 message sent/received
-        per conduit. When resuming, Component 1 starts by sending a new message
-        on its ``O_I`` port, and Component 2 runs ``F_INIT`` as usual.
-
-        .. code-block:: text
-
-              time        |t0                  |t4                        |t8
-            Component 1:  |Fi|Oi|.............. S |C |Oi|................. S |Oi|........
-                                \              /        \                 /     \
-            Component 3:        |Fi|S |S |S |Of|........ C |Fi|S |S |S |Of|..... Fi|S |S
-              time              |t0|t1|t2|t3               |t4|t5|t6|t7         |t8|t9|t10
-
-
-Interact coupling
-'''''''''''''''''
-
-In this section we will look at the interact coupling mode. This example
-simulation consists of two components: Component 1 and Component 2. They are
-coupled as follows:
-
--   The ``O_I`` port of Component 1 is connected to the ``S`` port of
-    Component 2.
--   The ``O_I`` port of Component 2 is connected to the ``S`` port of
-    Component 1.
-
-.. code-block:: text
-    :caption: Example lock-step interact run for three iterations.
-
-      time        |t0   |t1   |t2   |t3
-    Component 1:  |Fi|Oi|S |Oi|S |Oi|S |Of|
-                        X     X     X
-    Component 2:  |Fi|Oi|S |Oi|S |Oi|S |Of|
-      time        |t0   |t1   |t2   |t3
-
-Let's see what happens for different checkpoint times:
-
-.. tabs::
-
-    .. tab:: :math:`t_c \leq t_1`
-
-        In this case, both components make a snapshot After the first ``S``
-        block.
-
-        .. rubric:: Consistency
-
-        Both snapshots have the same message counts: 1 message sent/received
-        per conduit. When resuming, both components send the next message at
-        ``O_I`` and continue with their ``S``.
-
-        .. code-block:: text
-
-                  time        |t0   |t1      |t2   |t3
-                Component 1:  |Fi|Oi|S |C |Oi|S |Oi|S |Of|
-                                    X        X     X
-                Component 2:  |Fi|Oi|S |C |Oi|S |Oi|S |Of|
-                  time        |t0   |t1      |t2   |t3
-
-    .. tab:: :math:`t_1 < t_c \leq t_2`
-
-        This is almost the same as on the previous tab, just at a later point in
-        the run.
-
-        .. code-block:: text
-
-                  time        |t0   |t1   |t2      |t3
-                Component 1:  |Fi|Oi|S |Oi|S |C |Oi|S |Of|
-                                    X     X        X
-                Component 2:  |Fi|Oi|S |Oi|S |C |Oi|S |Of|
-                  time        |t0   |t1   |t2      |t3
-
-If the two components do not use the same time step, a scale bridge is required
-to interpolate. See ``docs/source/examples/python/interact_coupling.py`` for an
-implementation of such a component. The timeline becomes a bit more complicated
-now:
-
-.. code-block:: text
-    :caption: Example interact run. Component 1 has a smaller time step than Component 2.
-
-      time        |t0            |t1                     |t2         |t4
-    Component 1:  |Fi|Oi|........ S |Oi|................. S |Oi|..... S |Oi|...........
-                        \        /     \                 /     \     /     \
-    Scale bridge:       |S |S |Oi|..... S |Oi|..... S |Oi|..... S |Oi|..... S |Oi|.....
-                           /                 \     /                             \
-    Component 2:     |Fi|Oi|................. S |Oi|............................. S |Oi
-      time           |t0                     |t3                                 |t5
-
-Let's see what happens for different checkpoint times:
-
-.. tabs::
-
-    .. tab:: :math:`t_c \leq t_0`
-
-        In this case, both components make a snapshot after the first ``S``
-        block. The scale bridge creates a snapshot after the first two ``S`` are
-        complete.
-
-        .. rubric:: Consistency
-
-        Both component snapshots have received one more message on ``S`` than
-        the scale bridge has sent. This is no problem: when resuming, the scale
-        bridge will send the messages again, but those are discarded by both
-        components.
-
-        .. code-block:: text
-
-              time        |t0               |t1                           |t2         |t4
-            Component 1:  |Fi|Oi|........... S |C |Oi|.................... S |Oi|..... S |Oi|...........
-                                \           /        \                    /     \     /     \
-            Scale bridge:       |S |S |C |Oi|........ S |Oi|........ S |Oi|..... S |Oi|..... S |Oi|.....
-                                /                          \        /                             \
-            Component 2:     |Fi|Oi|....................... S |C |Oi|............................. S |Oi
-              time           |t0                           |t3                                    |t5
-
-    .. tab:: :math:`t_0 < t_c \leq t_1`
-
-        In this case, both components make a snapshot after the first ``S``
-        block. The scale bridge creates a snapshot after receiving the second
-        message from Component 2.
-
-        .. rubric:: Consistency
-
-        In this case, the scale bridge has received one more message on its
-        ``S`` port at its checkpoint moment, than the components have sent at
-        their checkpoints. Again, this is no problem: the components send their
-        messages again when resuming, but these are discarded by the scale
-        bridge.
-
-        .. code-block:: text
-
-              time        |t0            |t1                              |t2         |t4
-            Component 1:  |Fi|Oi|........ S |C |Oi|....................... S |Oi|..... S |Oi|...........
-                                \        /        \                       /     \     /     \
-            Scale bridge:       |S |S |Oi|........ S |Oi|........ S |C |Oi|..... S |Oi|..... S |Oi|.....
-                                /                       \        /                                \
-            Component 2:     |Fi|Oi|.................... S |C |Oi|................................ S |Oi
-              time           |t0                        |t3                                       |t5
-
-    .. tab:: :math:`t_1 < t_c \leq t_2`
-
-        Now component 1 takes a snapshot after its second ``S`` phase. Component
-        still takes a snapshot after its first ``S`` phase. The scale bridge
-        checkpoints after receiving the third message from Component 1.
-
-        .. rubric:: Consistency
-
-        Again, the scale bridge has received one more message on its
-        ``S`` port at its checkpoint moment, than the components have sent at
-        their checkpoints. Again, this is no problem: the components send their
-        messages again when resuming, but these are discarded by the scale
-        bridge.
-
-        .. code-block:: text
-
-              time        |t0            |t1                        |t2               |t4
-            Component 1:  |Fi|Oi|........ S |Oi|.................... S |C |Oi|......... S |Oi|...........
-                                \        /     \                    /        \        /     \
-            Scale bridge:       |S |S |Oi|..... S |Oi|........ S |Oi|........ S |C |Oi|..... S |Oi|.....
-                                /                    \        /                                   \
-            Component 2:     |Fi|Oi|................. S |C |Oi|................................... S |Oi
-              time           |t0                     |t3                                          |t5
-
-
-Dispatch coupling
-'''''''''''''''''
-
-Finally, we take a look at two component coupled in dispatch:
-
--   The ``O_F`` port of Component 1 is connected to the ``F_INIT`` port of
-    Component 2.
-
-This leads to the following timeline:
-
-.. code-block:: text
-    :caption: Example lock-step interact run for three iterations.
-
-      time        |t0|t1|t2|t3
-    Component 1:  |Fi|S |S |S |Of|
-                                 \
-    Component 2:                 |Fi|S |S |S |Of|
-      time                       |t3|t4|t5|t6
-
-.. tabs::
-
-    .. tab:: :math:`t_c \leq t_1`
-
-        In this case, both components make a snapshot after the first ``S``
-        block.
-
-        .. rubric:: Consistency
-
-        The snapshot of Component 1 can be combined with the snapshot of
-        Component 2, but then all remaining work of Component 1 will be ignored
-        by Component 2. It is also possible to restart Component 2 from scratch
-        (this is also consistent).
-
-        .. code-block:: text
-
-              time        |t0|t1   |t2|t3
-            Component 1:  |Fi|S |C |S |S |Of|
-                                            \
-            Component 2:                    |Fi|S |C |S |S |Of|
-              time                          |t3|t4|   t5|t6
-
-    .. tab:: :math:`t_1 < t_c \leq t_2`
-
-        This is similar to the previous tab. However, Component 1 takes a
-        snapshot at a later point.
-
-
-    .. tab:: :math:`t_3 < t_c \leq t_4`
-
-        In this case, Component 1 does not take a snapshot, unless either:
-
-        1.  A :ref:`checkpoint rule is defined <Defining checkpoints>` for
-            ``at_end``, or
-        2.  Component 1 is executed again (for example, when this is a
-            sub-workflow in a call/release coupling) and a final snapshot is
-            triggered.
-
-        .. rubric:: Consistency
-
-        When a final snapshot is taken by Component 1, it will be consistent
-        with any checkpoint taken during the exeuction of Component 2 and we can
-        restart the workflow.
-
-        .. code-block:: text
-
-              time        |t0|t1|t2|t3
-            Component 1:  |Fi|S |S |S |Of|C?
-                                         \
-            Component 2:                 |Fi|S |C |S |S |Of|
-              time                       |t3|t4|   t5|t6
-
-
-(In)consistency for wallclock time checkpoints
-``````````````````````````````````````````````
-
-In the current implementation, wallclock time checkpoints are taken as soon as
-possible after exceeding a certain wallclock time. Let's look at an example
-where this is not leading to consistent workflow snapshots.
-
-This example is similar to the :ref:`Interact coupling` example seen previously.
-
--   The ``O_I`` port of Component 1 is connected to the ``S`` port of
-    Component 2.
--   The ``O_I`` port of Component 2 is connected to the ``S`` port of
-    Component 1.
-
-However, let's now look at the wallclock time and assume that Component 1's
-``S`` Operator takes longer than Component 2's, compute time indicated by
-``~~``:
-
-.. code-block:: text
-
-    Wallclock time:         |w1|w2    |w3|w4
-    Component 1:  |Fi|Oi|.S ~~~|Oi|.S ~~~|Oi|.S ~~~|Of|
-                        \/      __\/      __\/
-                        /\     /   \     /   \
-    Component 2:  |Fi|Oi|.S |Oi|... S |Oi|... S |Of|
-
-Because Component 1 spends more time in ``S``, Component 2 is waiting in each
-following iteration of ``S``. Let's see what happens for different wallclock
-time checkpoint moments :math:`w_c`:
-
-.. tabs::
-
-    .. tab:: :math:`w_c \leq w_1`
-
-        In this case, both components make a snapshot after the first ``S``
-        block.
-
-        .. rubric:: Consistency
-
-        At the moment of snapshot, both components have the same number of
-        messages sent/received on their conduits. This is consistent.
-
-        .. code-block:: text
-
-            Wallclock time:         |w1|w2       |w3|w4
-            Component 1:  |Fi|Oi|.S ~~~|C |Oi|.S ~~~|Oi|.S ~~~|Of|
-                                \/         __\/      __\/
-                                /\        /   \     /   \
-            Component 2:  |Fi|Oi|.S |C |Oi|... S |Oi|... S |Of|
-
-    .. tab:: :math:`w_1 < w_c \leq w_2`
-
-        Component 1 takes a snapshot after the first ``S`` block, but Component
-        2 after its second ``S`` block.
-
-        .. rubric:: Consistency
-
-        The created snapshots are not consistent: Component 2 has sent 1 more
-        message than Component 1 has received. When resuming Component 1 would
-        wait for a message that never comes again, so this is not a valid resume
-        point.
-
-        .. code-block:: text
-
-            Wallclock time:         |w1|w2       |w3|w4
-            Component 1:  |Fi|Oi|.S ~~~|C |Oi|.S ~~~|Oi|.S ~~~|Of|
-                                \/      _____\/        \/
-                                /\     /      \        /\
-            Component 2:  |Fi|Oi|.S |Oi|...... S |C |Oi|. S |Of|
-
-As you can see, the second scenario does not lead to consistent checkpoints.
-
diff --git a/docs/source/checkpointing_deep_dive.rst b/docs/source/checkpointing_deep_dive.rst
new file mode 100644
index 00000000..5b5ef170
--- /dev/null
+++ b/docs/source/checkpointing_deep_dive.rst
@@ -0,0 +1,535 @@
+Checkpointing deep-dive
+=======================
+
+This checkpointing deep-dive explains the details of the distributed
+checkpointing implemented in MUSCLE3. Usually you will not need to read or
+understand these details when you want to run simulations with checkpointing
+(see :ref:`User tutorial`) or implement checkpointing in a MUSCLE3 component
+(see :ref:`Developer tutorial`).
+
+.. contents:: Checkpointing deep-dive contents
+    :local:
+
+
+Consistency for simulation time checkpoints
+-------------------------------------------
+
+In this section we take a look at the three allowed coupling types in the MMSF:
+call/release, interact and dispatch coupling. In the following sections
+we will analyze consistency for each of the coupling types.
+
+The underlying assumption is: if we can take consistent snapshots for each pair
+of coupled components, we can take consistent snapshots of the whole workflow.
+
+Call/release coupling
+`````````````````````
+
+In this section we will look at the call/release coupling mode. The first
+example simulation consists of two components: Component 1 and Component 2. They
+are coupled as follows:
+
+-   The ``O_I`` port of Component 1 is connected to the ``F_INIT`` port of
+    Component 2.
+-   The ``O_F`` port of Component 2 is connected to the ``S`` port of
+    Component 1.
+
+.. code-block:: text
+    :caption: Example run for three iterations of Component 1.
+
+    Component 1:  |Fi|Oi|........ S |Oi|........ S |Oi|........ S |Of|
+                        \        /     \        /     \        /
+    Component 2:        |Fi|S |Of|..... Fi|S |Of|..... Fi|S |Of|
+
+The above schema shows the Operator (``F_INIT``, ``O_I``, ``S``, ``Of``) that
+each comonent is in during the run. The dots (``...``) indicate a blocking
+call: in this case it is the ``receive`` during the ``S`` operator of Component
+1, and the ``receive`` of the ``F_INIT`` operator of Component 2.
+
+Let's add the simulation time for each component on the example timeline.
+
+-   During ``F_INIT``, the internal time is initialized. Component 1 initializes
+    to a constant ``t0``. Component 2 initializes the time to the timestamp
+    received in the message.
+-   During ``S`` the state is updated and the simulation time may move forward.
+
+.. code-block:: text
+    :caption: Example run, also showing simulation times in the components.
+
+      time        |t0            |t2            |t4            |t6
+    Component 1:  |Fi|Oi|........ S |Oi|........ S |Oi|........ S |Of|
+                        \        /     \        /     \        /
+    Component 2:        |Fi|S |Of|..... Fi|S |Of|..... Fi|S |Of|
+      time              |t0|t1         |t2|t3         |t4|t5
+
+We assume that each component only moves forward in time, so
+:math:`t_0 \le t_2 \le t_4 \le t_6` and :math:`t_0 \le t_1`, :math:`t_2 \le t_3`
+and :math:`t_4 \le t_5`. The time evolution of Component 2 should be smaller
+than the time step of Component 1 in this coupling type. Therefore:
+:math:`t_1 \le t_2`, :math:`t_3 \le t_4` and :math:`t_5 \le t_6`.
+
+.. rubric:: Introducing checkpoints
+
+Component 1 can take checkpoints immediately after the ``S`` operator. Component
+2 can only take checkpoints after the ``O_F`` operator. Let's investigate what
+needs to happen when a checkpoint :math:`t_c` is requested for different values
+of :math:`t_c`:
+
+1.  :math:`t_c \leq t_0`
+2.  :math:`t_0 < t_c \leq t_1`
+3.  :math:`t_1 < t_c \leq t_2`
+4.  :math:`t_2 < t_c  \leq t_4`
+
+Note: a checkpoint :math:`t_4 < t_c  \leq t_6` would behave the same as scenario
+4, just at a later point in the simulation, so we won't work out later
+checkpoints in detail.
+
+.. tabs::
+
+    .. tab:: :math:`t_c \leq t_0`
+
+        Both components will take a snapshot at the earliest possible moment,
+        indicated with a ``C`` block in the timelines below.
+
+        You may notice that the ``C`` block in Component 2 is blocking. Although
+        the internal time of Component 2 already exceeded the checkpoint time,
+        :ref:`Final snapshots` actually determine if a snapshot should be
+        taken based on the message(s) arriving during the next ``F_INIT``.
+
+        .. rubric:: Consistency
+
+        Both snapshots have the same message counts: 1 message sent/received per
+        conduit. When resuming, Component 1 starts by sending a new message on
+        its ``O_I`` port, and Component 2 runs ``F_INIT`` as usual.
+
+        .. code-block:: text
+
+              time        |t0            |t2                  |t4            |t6
+            Component 1:  |Fi|Oi|........ S |C |Oi|........... S |Oi|........ S |Of|
+                                \        /        \           /     \        /
+            Component 2:        |Fi|S |Of|........ C |Fi|S |Of|..... Fi|S |Of|
+              time              |t0|t1               |t2|t3         |t4|t5
+
+    .. tab:: :math:`t_0 < t_c \leq t_1` and :math:`t_1 < t_c \leq t_2`
+
+        For both checkpoint times, a snapshot will be taken at the earliest
+        possible moment.
+
+        After the first ``S`` operator, Component 1 is at :math:`t=t_2` which is
+        after the checkpoint time, so it takes a snapshot. After the first reuse
+        loop, Component 2 receives a message with :math:`t=t_2` which is after
+        the checkpoint time, so it will take a snapshot at the end of the first
+        reuse loop.
+
+        .. rubric:: Consistency
+
+        Both snapshots have the same message counts: 1 message sent/received per
+        conduit. When resuming, Component 1 starts by sending a new message on
+        its ``O_I`` port, and Component 2 runs ``F_INIT`` as usual.
+
+        .. code-block:: text
+
+              time        |t0            |t2                  |t4            |t6
+            Component 1:  |Fi|Oi|........ S |C |Oi|........... S |Oi|........ S |Of|
+                                \        /        \           /     \        /
+            Component 2:        |Fi|S |Of|........ C |Fi|S |Of|..... Fi|S |Of|
+              time              |t0|t1               |t2|t3         |t4|t5
+
+    .. tab:: :math:`t_2 < t_c  \leq t_4`
+
+        Both components will take a snapshot at the earliest possible moment,
+        indicated with a ``C`` block in the timelines below.
+
+        After the first ``S`` operator, Component 1 is at :math:`t=t_2` which is
+        before the checkpoint time. After the second ``S`` operator it has
+        passed the checkpoint time, so it takes a snapshot. This works similarly
+        for Component 2.
+
+        .. rubric:: Consistency
+
+        Both snapshots have the same message counts: 2 messages sent/received
+        per conduit. When resuming, Component 1 starts by sending a new message
+        on its ``O_I`` port, and Component 2 runs ``F_INIT`` as usual.
+
+        .. code-block:: text
+
+              time        |t0            |t2            |t4                  |t6
+            Component 1:  |Fi|Oi|........ S |Oi|........ S |C |Oi|........... S |Of|
+                                \        /     \        /        \           /
+            Component 2:        |Fi|S |Of|..... Fi|S |Of|........ C |Fi|S |Of|
+              time              |t0|t1         |t2|t3               |t4|t5
+
+
+.. rubric:: Micro component with time integration and intermediate snapshots
+
+Let's see what happens when we replace Component 2 by Component 3, which does
+time integration and implements intermediate snapshots.
+
+
+.. code-block:: text
+    :caption: Example run, also showing simulation times in the components.
+
+      time        |t0                  |t4                  |t8
+    Component 1:  |Fi|Oi|.............. S |Oi|.............. S |Oi|........
+                        \              /     \              /     \
+    Component 3:        |Fi|S |S |S |Of|..... Fi|S |S |S |Of|..... Fi|S |S
+      time              |t0|t1|t2|t3         |t4|t5|t6|t7         |t8|t9|t10
+
+For the same reasons as with Component 2, :math:`t_i \leq t_{i+1}` for
+:math:`i=0,1,...`.
+
+Now, Component 3 can make intermediate snapshots between each ``S``, but also
+final snapshots. Let's see what effect that has for different checkpoint times:
+
+.. tabs::
+
+    .. tab:: :math:`t_c \leq t_1`
+
+        In this case, both components will take a snapshot at the first possible
+        moment: right after their first ``S`` block.
+
+        .. rubric:: Consistency
+
+        Now the snapshots have different message counts. For the ``O_I ->
+        F_INIT`` conduit both components see 1 message sent/received. For the
+        other conduit, however, Component 1 already received a message that is
+        not sent in Component 3's snapshot.
+
+        When resuming, Component 3 resumes in its state update loop and sends a
+        message back to Component 1 during ``O_F``. This message is discarded by
+        Component 1. From that point, the simulation can resume as usual.
+
+        .. code-block:: text
+
+              time        |t0                     |t4                     |t8
+            Component 1:  |Fi|Oi|................. S |C |Oi|.............. S |Oi|........
+                                \                 /        \              /     \
+            Component 3:        |Fi|S |C |S |S |Of|........ Fi|S |S |S |Of|..... Fi|S |S
+              time              |t0|t1   |t2|t3            |t4|t5|t6|t7         |t8|t9|t10
+
+    .. tab:: :math:`t_1 < t_c \leq t_2`
+
+        This is quite similar to the previous case. The difference is that
+        Component 3 takes its snapshot after the second ``S`` block.
+
+        .. code-block:: text
+
+              time        |t0                     |t4                     |t8
+            Component 1:  |Fi|Oi|................. S |C |Oi|.............. S |Oi|........
+                                \                 /        \              /     \
+            Component 3:        |Fi|S |S |C |S |Of|........ Fi|S |S |S |Of|..... Fi|S |S
+              time              |t0|t1|t2   |t3            |t4|t5|t6|t7         |t8|t9|t10
+
+    .. tab:: :math:`t_3 < t_c \leq t_4`
+
+        The checkpoint for Component 1 does not change. However, in this case
+        Component 3 takes a :ref:`final snapshot <Final snapshots>` instead of
+        an :ref:`intermediate snapshot <Intermediate snapshots>`.
+
+        .. rubric:: Consistency
+
+        Both snapshots have the same message counts: 1 message sent/received
+        per conduit. When resuming, Component 1 starts by sending a new message
+        on its ``O_I`` port, and Component 2 runs ``F_INIT`` as usual.
+
+        .. code-block:: text
+
+              time        |t0                  |t4                        |t8
+            Component 1:  |Fi|Oi|.............. S |C |Oi|................. S |Oi|........
+                                \              /        \                 /     \
+            Component 3:        |Fi|S |S |S |Of|........ C |Fi|S |S |S |Of|..... Fi|S |S
+              time              |t0|t1|t2|t3               |t4|t5|t6|t7         |t8|t9|t10
+
+
+Interact coupling
+`````````````````
+
+In this section we will look at the interact coupling mode. This example
+simulation consists of two components: Component 1 and Component 2. They are
+coupled as follows:
+
+-   The ``O_I`` port of Component 1 is connected to the ``S`` port of
+    Component 2.
+-   The ``O_I`` port of Component 2 is connected to the ``S`` port of
+    Component 1.
+
+.. code-block:: text
+    :caption: Example lock-step interact run for three iterations.
+
+      time        |t0   |t1   |t2   |t3
+    Component 1:  |Fi|Oi|S |Oi|S |Oi|S |Of|
+                        X     X     X
+    Component 2:  |Fi|Oi|S |Oi|S |Oi|S |Of|
+      time        |t0   |t1   |t2   |t3
+
+Let's see what happens for different checkpoint times:
+
+.. tabs::
+
+    .. tab:: :math:`t_c \leq t_1`
+
+        In this case, both components make a snapshot After the first ``S``
+        block.
+
+        .. rubric:: Consistency
+
+        Both snapshots have the same message counts: 1 message sent/received
+        per conduit. When resuming, both components send the next message at
+        ``O_I`` and continue with their ``S``.
+
+        .. code-block:: text
+
+                  time        |t0   |t1      |t2   |t3
+                Component 1:  |Fi|Oi|S |C |Oi|S |Oi|S |Of|
+                                    X        X     X
+                Component 2:  |Fi|Oi|S |C |Oi|S |Oi|S |Of|
+                  time        |t0   |t1      |t2   |t3
+
+    .. tab:: :math:`t_1 < t_c \leq t_2`
+
+        This is almost the same as on the previous tab, just at a later point in
+        the run.
+
+        .. code-block:: text
+
+                  time        |t0   |t1   |t2      |t3
+                Component 1:  |Fi|Oi|S |Oi|S |C |Oi|S |Of|
+                                    X     X        X
+                Component 2:  |Fi|Oi|S |Oi|S |C |Oi|S |Of|
+                  time        |t0   |t1   |t2      |t3
+
+If the two components do not use the same time step, a scale bridge is required
+to interpolate. See ``docs/source/examples/python/interact_coupling.py`` for an
+implementation of such a component. The timeline becomes a bit more complicated
+now:
+
+.. code-block:: text
+    :caption: Example interact run. Component 1 has a smaller time step than Component 2.
+
+      time        |t0            |t1                     |t2         |t4
+    Component 1:  |Fi|Oi|........ S |Oi|................. S |Oi|..... S |Oi|...........
+                        \        /     \                 /     \     /     \
+    Scale bridge:       |S |S |Oi|..... S |Oi|..... S |Oi|..... S |Oi|..... S |Oi|.....
+                           /                 \     /                             \
+    Component 2:     |Fi|Oi|................. S |Oi|............................. S |Oi
+      time           |t0                     |t3                                 |t5
+
+Let's see what happens for different checkpoint times:
+
+.. tabs::
+
+    .. tab:: :math:`t_c \leq t_0`
+
+        In this case, both components make a snapshot after the first ``S``
+        block. The scale bridge creates a snapshot after the first two ``S`` are
+        complete.
+
+        .. rubric:: Consistency
+
+        Both component snapshots have received one more message on ``S`` than
+        the scale bridge has sent. This is no problem: when resuming, the scale
+        bridge will send the messages again, but those are discarded by both
+        components.
+
+        .. code-block:: text
+
+              time        |t0               |t1                           |t2         |t4
+            Component 1:  |Fi|Oi|........... S |C |Oi|.................... S |Oi|..... S |Oi|...........
+                                \           /        \                    /     \     /     \
+            Scale bridge:       |S |S |C |Oi|........ S |Oi|........ S |Oi|..... S |Oi|..... S |Oi|.....
+                                /                          \        /                             \
+            Component 2:     |Fi|Oi|....................... S |C |Oi|............................. S |Oi
+              time           |t0                           |t3                                    |t5
+
+    .. tab:: :math:`t_0 < t_c \leq t_1`
+
+        In this case, both components make a snapshot after the first ``S``
+        block. The scale bridge creates a snapshot after receiving the second
+        message from Component 2.
+
+        .. rubric:: Consistency
+
+        In this case, the scale bridge has received one more message on its
+        ``S`` port at its checkpoint moment, than the components have sent at
+        their checkpoints. Again, this is no problem: the components send their
+        messages again when resuming, but these are discarded by the scale
+        bridge.
+
+        .. code-block:: text
+
+              time        |t0            |t1                              |t2         |t4
+            Component 1:  |Fi|Oi|........ S |C |Oi|....................... S |Oi|..... S |Oi|...........
+                                \        /        \                       /     \     /     \
+            Scale bridge:       |S |S |Oi|........ S |Oi|........ S |C |Oi|..... S |Oi|..... S |Oi|.....
+                                /                       \        /                                \
+            Component 2:     |Fi|Oi|.................... S |C |Oi|................................ S |Oi
+              time           |t0                        |t3                                       |t5
+
+    .. tab:: :math:`t_1 < t_c \leq t_2`
+
+        Now component 1 takes a snapshot after its second ``S`` phase. Component
+        still takes a snapshot after its first ``S`` phase. The scale bridge
+        checkpoints after receiving the third message from Component 1.
+
+        .. rubric:: Consistency
+
+        Again, the scale bridge has received one more message on its
+        ``S`` port at its checkpoint moment, than the components have sent at
+        their checkpoints. Again, this is no problem: the components send their
+        messages again when resuming, but these are discarded by the scale
+        bridge.
+
+        .. code-block:: text
+
+              time        |t0            |t1                        |t2               |t4
+            Component 1:  |Fi|Oi|........ S |Oi|.................... S |C |Oi|......... S |Oi|...........
+                                \        /     \                    /        \        /     \
+            Scale bridge:       |S |S |Oi|..... S |Oi|........ S |Oi|........ S |C |Oi|..... S |Oi|.....
+                                /                    \        /                                   \
+            Component 2:     |Fi|Oi|................. S |C |Oi|................................... S |Oi
+              time           |t0                     |t3                                          |t5
+
+
+Dispatch coupling
+`````````````````
+
+Finally, we take a look at two component coupled in dispatch:
+
+-   The ``O_F`` port of Component 1 is connected to the ``F_INIT`` port of
+    Component 2.
+
+This leads to the following timeline:
+
+.. code-block:: text
+    :caption: Example lock-step interact run for three iterations.
+
+      time        |t0|t1|t2|t3
+    Component 1:  |Fi|S |S |S |Of|
+                                 \
+    Component 2:                 |Fi|S |S |S |Of|
+      time                       |t3|t4|t5|t6
+
+.. tabs::
+
+    .. tab:: :math:`t_c \leq t_1`
+
+        In this case, both components make a snapshot after the first ``S``
+        block.
+
+        .. rubric:: Consistency
+
+        The snapshot of Component 1 can be combined with the snapshot of
+        Component 2, but then all remaining work of Component 1 will be ignored
+        by Component 2. It is also possible to restart Component 2 from scratch
+        (this is also consistent).
+
+        .. code-block:: text
+
+              time        |t0|t1   |t2|t3
+            Component 1:  |Fi|S |C |S |S |Of|
+                                            \
+            Component 2:                    |Fi|S |C |S |S |Of|
+              time                          |t3|t4|   t5|t6
+
+    .. tab:: :math:`t_1 < t_c \leq t_2`
+
+        This is similar to the previous tab. However, Component 1 takes a
+        snapshot at a later point.
+
+
+    .. tab:: :math:`t_3 < t_c \leq t_4`
+
+        In this case, Component 1 does not take a snapshot, unless either:
+
+        1.  A :ref:`checkpoint rule is defined <Defining checkpoints>` for
+            ``at_end``, or
+        2.  Component 1 is executed again (for example, when this is a
+            sub-workflow in a call/release coupling) and a final snapshot is
+            triggered.
+
+        .. rubric:: Consistency
+
+        When a final snapshot is taken by Component 1, it will be consistent
+        with any checkpoint taken during the exeuction of Component 2 and we can
+        restart the workflow.
+
+        .. code-block:: text
+
+              time        |t0|t1|t2|t3
+            Component 1:  |Fi|S |S |S |Of|C?
+                                         \
+            Component 2:                 |Fi|S |C |S |S |Of|
+              time                       |t3|t4|   t5|t6
+
+
+(In)consistency for wallclock time checkpoints
+----------------------------------------------
+
+In the current implementation, wallclock time checkpoints are taken as soon as
+possible after exceeding a certain wallclock time. Let's look at an example
+where this is not leading to consistent workflow snapshots.
+
+This example is similar to the :ref:`Interact coupling` example seen previously.
+
+-   The ``O_I`` port of Component 1 is connected to the ``S`` port of
+    Component 2.
+-   The ``O_I`` port of Component 2 is connected to the ``S`` port of
+    Component 1.
+
+However, let's now look at the wallclock time and assume that Component 1's
+``S`` Operator takes longer than Component 2's, compute time indicated by
+``~~``:
+
+.. code-block:: text
+
+    Wallclock time:         |w1|w2    |w3|w4
+    Component 1:  |Fi|Oi|.S ~~~|Oi|.S ~~~|Oi|.S ~~~|Of|
+                        \/      __\/      __\/
+                        /\     /   \     /   \
+    Component 2:  |Fi|Oi|.S |Oi|... S |Oi|... S |Of|
+
+Because Component 1 spends more time in ``S``, Component 2 is waiting in each
+following iteration of ``S``. Let's see what happens for different wallclock
+time checkpoint moments :math:`w_c`:
+
+.. tabs::
+
+    .. tab:: :math:`w_c \leq w_1`
+
+        In this case, both components make a snapshot after the first ``S``
+        block.
+
+        .. rubric:: Consistency
+
+        At the moment of snapshot, both components have the same number of
+        messages sent/received on their conduits. This is consistent.
+
+        .. code-block:: text
+
+            Wallclock time:         |w1|w2       |w3|w4
+            Component 1:  |Fi|Oi|.S ~~~|C |Oi|.S ~~~|Oi|.S ~~~|Of|
+                                \/         __\/      __\/
+                                /\        /   \     /   \
+            Component 2:  |Fi|Oi|.S |C |Oi|... S |Oi|... S |Of|
+
+    .. tab:: :math:`w_1 < w_c \leq w_2`
+
+        Component 1 takes a snapshot after the first ``S`` block, but Component
+        2 after its second ``S`` block.
+
+        .. rubric:: Consistency
+
+        The created snapshots are not consistent: Component 2 has sent 1 more
+        message than Component 1 has received. When resuming Component 1 would
+        wait for a message that never comes again, so this is not a valid resume
+        point.
+
+        .. code-block:: text
+
+            Wallclock time:         |w1|w2       |w3|w4
+            Component 1:  |Fi|Oi|.S ~~~|C |Oi|.S ~~~|Oi|.S ~~~|Of|
+                                \/      _____\/        \/
+                                /\     /      \        /\
+            Component 2:  |Fi|Oi|.S |Oi|...... S |C |Oi|. S |Of|
+
+As you can see, the second scenario does not lead to consistent checkpoints.
+
diff --git a/docs/source/for_developers.rst b/docs/source/for_developers.rst
new file mode 100644
index 00000000..0ef3e9ef
--- /dev/null
+++ b/docs/source/for_developers.rst
@@ -0,0 +1,12 @@
+For Developers
+**************
+
+.. toctree::
+   :maxdepth: 4
+   :caption: Contents:
+
+   contributing
+   devtools
+   releasing
+   checkpointing_deep_dive
+
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 1e7c05d0..b6368f00 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -42,10 +42,7 @@ Cham.  `<https://doi.org/10.1007/978-3-030-50433-5_33>`_
    python_api
    cpp_api
    fortran_api
-
-   contributing
-   devtools
-   releasing
+   for_developers
 
 
 Indices and tables

From e1ff028f2527f1a9bfdca88719ce15f49a789e23 Mon Sep 17 00:00:00 2001
From: Lourens Veen <l.veen@esciencecenter.nl>
Date: Mon, 16 Jan 2023 21:15:28 +0100
Subject: [PATCH 155/183] Try to fix Python API docs

---
 tox.ini | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tox.ini b/tox.ini
index 2a98e75a..16062995 100644
--- a/tox.ini
+++ b/tox.ini
@@ -41,5 +41,6 @@ deps =
     sphinx-fortran
     sphinx_rtd_theme
     sphinx-tabs
+    git+https://github.com/multiscale/ymmsl-python.git@develop#egg=ymmsl
 commands = sphinx-build docs/source docs/build -bhtml
 

From 4bb45311b3efcfb433e896d6cd4d04864c6409d3 Mon Sep 17 00:00:00 2001
From: Lourens Veen <l.veen@esciencecenter.nl>
Date: Mon, 16 Jan 2023 22:30:26 +0100
Subject: [PATCH 156/183] Fix some small issues to try to fix RTD Python API
 build

---
 docs/source/devtools.rst                      |  1 +
 libmuscle/python/libmuscle/manager/run_dir.py | 31 ++++++++++---------
 .../libmuscle/manager/snapshot_registry.py    |  2 +-
 3 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/docs/source/devtools.rst b/docs/source/devtools.rst
index 3782fbce..7ff352d8 100644
--- a/docs/source/devtools.rst
+++ b/docs/source/devtools.rst
@@ -129,6 +129,7 @@ documentation and display it online. This uses Sphinx, but does not support
 
 .. _`setuptools`: https://setuptools.readthedocs.io
 .. _`Requires.io`: https://requires.io/
+.. _`tox`: https://tox.wiki
 .. _`flake8`: https://github.com/PyCQA/flake8
 .. _`mypy`: https://mypy.readthedocs.io
 .. _`Codacy`: https://support.codacy.com
diff --git a/libmuscle/python/libmuscle/manager/run_dir.py b/libmuscle/python/libmuscle/manager/run_dir.py
index 186d32e8..fdbfbed0 100644
--- a/libmuscle/python/libmuscle/manager/run_dir.py
+++ b/libmuscle/python/libmuscle/manager/run_dir.py
@@ -9,20 +9,23 @@ class RunDir:
 
     The directory is laid out as follows:
 
-    <run_dir>/
-        input/
-        1_<name>.ymmsl
-        2_<name>.ymmsl
-        muscle_manager.log
-        muscle_stats.sqlite
-        instances/
-            <instance_name[i]>/
-                run_script.sh
-                <instance_name[i]>.out
-                <instance_name[i]>.err
-                work_dir/
-                snapshots/
-        snapshots/
+    .. code-block::
+
+        <run_dir>/
+            input/
+            1_<name>.ymmsl
+            2_<name>.ymmsl
+            muscle_manager.log
+            muscle_stats.sqlite
+            instances/
+                <instance_name[i]>/
+                    run_script.sh
+                    <instance_name[i]>.out
+                    <instance_name[i]>.err
+                    work_dir/
+                    snapshots/
+            snapshots/
+
     """
     def __init__(self, run_dir: Path) -> None:
         """Create a RunDir managing the given directory.
diff --git a/libmuscle/python/libmuscle/manager/snapshot_registry.py b/libmuscle/python/libmuscle/manager/snapshot_registry.py
index 3883ea9a..d702f061 100644
--- a/libmuscle/python/libmuscle/manager/snapshot_registry.py
+++ b/libmuscle/python/libmuscle/manager/snapshot_registry.py
@@ -129,7 +129,7 @@ def do_consistency_check(
         """Check if the snapshot of the peer is consistent with us.
 
         When the peer snapshot is consistent, adds it to our list of consistent
-        peer snapshots (in :attribute:`consistent_peers`) and vice versa.
+        peer snapshots (in :attr:`consistent_peers`) and vice versa.
 
         Args:
             peer_node: Snapshot of one of our peers

From 04028d2d418964361f8426636ae14a2f46c6951e Mon Sep 17 00:00:00 2001
From: Lourens Veen <l.veen@esciencecenter.nl>
Date: Mon, 16 Jan 2023 23:17:43 +0100
Subject: [PATCH 157/183] Apply srunmpi overlap/overcommit fix

---
 libmuscle/python/libmuscle/manager/qcgpj_instantiator.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/libmuscle/python/libmuscle/manager/qcgpj_instantiator.py b/libmuscle/python/libmuscle/manager/qcgpj_instantiator.py
index a150904d..4166ec2a 100644
--- a/libmuscle/python/libmuscle/manager/qcgpj_instantiator.py
+++ b/libmuscle/python/libmuscle/manager/qcgpj_instantiator.py
@@ -393,7 +393,8 @@ def _qcg_job_execution_normal(
                 modules=impl.modules,
                 venv=str(impl.virtual_env) if impl.virtual_env else None,
                 wd=str(request.work_dir),
-                model=qcg_execution_model)
+                model=qcg_execution_model,
+                model_opts={'srun_opts': ['--overlap']})
 
     def _with_local_open_mpi(
             self, executable: str, args: List[str], num_processes: int

From e85c55099b697fdc284f72f962d7fbfdae69c60e Mon Sep 17 00:00:00 2001
From: Lourens Veen <l.veen@esciencecenter.nl>
Date: Mon, 16 Jan 2023 23:16:52 +0100
Subject: [PATCH 158/183] Update copyright year

---
 NOTICE              | 3 ++-
 README.rst          | 4 ++--
 docs/source/conf.py | 2 +-
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/NOTICE b/NOTICE
index 2a353538..c5d40022 100644
--- a/NOTICE
+++ b/NOTICE
@@ -1,3 +1,4 @@
 MUSCLE3
 Copyright 2018-2022, Netherlands eScience Center and University of Amsterdam
-Copyright 2022, The ITER Organization
+Copyright 2023, Netherlands eScience Center
+Copyright 2022-2023, The ITER Organization
diff --git a/README.rst b/README.rst
index 27405971..eed265b4 100644
--- a/README.rst
+++ b/README.rst
@@ -40,8 +40,8 @@ Legal
 =====
 
 MUSCLE3 is Copyright 2018-2022 University of Amsterdam and Netherlands eScience
-Center, and Copyright 2022 ITER Organisation. It is licensed under the Apache
-License 2.0.
+Center, Copyright 2023 Netherlands eScience Center, and Copyright 2022-2023 The
+ITER Organisation. It is licensed under the Apache License 2.0.
 
 
 Contributing
diff --git a/docs/source/conf.py b/docs/source/conf.py
index d59b66dc..31602a46 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -69,7 +69,7 @@
 
 # General information about the project.
 project = 'muscle3'
-copyright = '2018-2022 University of Amsterdam and Netherlands eScience Center, 2022 The ITER Organization'
+copyright = '2018-2022 University of Amsterdam and Netherlands eScience Center, 2023 Netherlands eScience Center, and 2022-2023 The ITER Organization'
 author = 'Lourens Veen'
 
 # The version info for the project you're documenting, acts as replacement for

From 22a7022820bf35aad382c3d25a5f1bb9fcc61797 Mon Sep 17 00:00:00 2001
From: Lourens Veen <l.veen@esciencecenter.nl>
Date: Mon, 16 Jan 2023 23:26:57 +0100
Subject: [PATCH 159/183] Fix wording in introduction (fixes #104)

---
 docs/source/introduction.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/introduction.rst b/docs/source/introduction.rst
index 78997a6f..f0d2849a 100644
--- a/docs/source/introduction.rst
+++ b/docs/source/introduction.rst
@@ -20,7 +20,7 @@ MUSCLE3 is intended to scale from your laptop to the exascale. At the low end,
 it supports a non-distributed but parallel mode in which an entire multiscale
 simulation, including all component implementations and the MMSL configuration,
 is in a single (short) Python file. Next is a distributed mode where the
-manager and component instances are started on multiple nodes in a cluster, and
+manager and component instances run on multiple nodes in a cluster, and
 communicate directly with one another. Beyond that, additional components and
 optimisations are envisioned that would allow scaling to huge machines or
 combinations of multiple machines. Our goal is to make the transitions between

From 1312bf4422e4b5195c3b3940c370c683d09ad34b Mon Sep 17 00:00:00 2001
From: Lourens Veen <l.veen@esciencecenter.nl>
Date: Mon, 16 Jan 2023 23:31:57 +0100
Subject: [PATCH 160/183] Correctly describe data format in examples (#105)

---
 docs/source/examples/cpp/reaction.cpp                         | 4 ++--
 docs/source/examples/cpp/reaction_mpi.cpp                     | 4 ++--
 docs/source/examples/python/checkpointing_reaction.py         | 4 ++--
 docs/source/examples/python/reaction_diffusion.py             | 4 ++--
 docs/source/examples/python/reaction_diffusion_qmc.py         | 4 ++--
 docs/source/examples/python/reaction_no_state_for_next_use.py | 4 ++--
 6 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/docs/source/examples/cpp/reaction.cpp b/docs/source/examples/cpp/reaction.cpp
index 129b9a3e..1a7e2863 100644
--- a/docs/source/examples/cpp/reaction.cpp
+++ b/docs/source/examples/cpp/reaction.cpp
@@ -16,8 +16,8 @@ using ymmsl::Operator;
  */
 void reaction(int argc, char * argv[]) {
     Instance instance(argc, argv, {
-            {Operator::F_INIT, {"initial_state"}},  // list of double
-            {Operator::O_F, {"final_state"}}});     // list of double
+            {Operator::F_INIT, {"initial_state"}},  // 1D Grid
+            {Operator::O_F, {"final_state"}}});     // 1D Grid
 
     while (instance.reuse_instance()) {
 
diff --git a/docs/source/examples/cpp/reaction_mpi.cpp b/docs/source/examples/cpp/reaction_mpi.cpp
index 92077217..3c8b9f8c 100644
--- a/docs/source/examples/cpp/reaction_mpi.cpp
+++ b/docs/source/examples/cpp/reaction_mpi.cpp
@@ -22,8 +22,8 @@ void reaction(int argc, char * argv[]) {
     MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);
 
     Instance instance(argc, argv, {
-            {Operator::F_INIT, {"initial_state"}},  // list of double
-            {Operator::O_F, {"final_state"}}},      // list of double
+            {Operator::F_INIT, {"initial_state"}},  // 1D Grid
+            {Operator::O_F, {"final_state"}}},      // 1D Grid
             MPI_COMM_WORLD, root_rank);
 
     while (instance.reuse_instance()) {
diff --git a/docs/source/examples/python/checkpointing_reaction.py b/docs/source/examples/python/checkpointing_reaction.py
index fbfb1d74..9e8ad876 100644
--- a/docs/source/examples/python/checkpointing_reaction.py
+++ b/docs/source/examples/python/checkpointing_reaction.py
@@ -8,8 +8,8 @@ def reaction() -> None:
     """A simple exponential reaction model on a 1D grid.
     """
     instance = Instance({
-            Operator.F_INIT: ['initial_state'],     # list of float
-            Operator.O_F: ['final_state']},         # list of float
+            Operator.F_INIT: ['initial_state'],     # 1D Grid
+            Operator.O_F: ['final_state']},         # 1D Grid
             USES_CHECKPOINT_API)
 
     while instance.reuse_instance():
diff --git a/docs/source/examples/python/reaction_diffusion.py b/docs/source/examples/python/reaction_diffusion.py
index 75958d5f..b212d38a 100644
--- a/docs/source/examples/python/reaction_diffusion.py
+++ b/docs/source/examples/python/reaction_diffusion.py
@@ -13,8 +13,8 @@ def reaction() -> None:
     """A simple exponential reaction model on a 1D grid.
     """
     instance = Instance({
-            Operator.F_INIT: ['initial_state'],     # list of float
-            Operator.O_F: ['final_state']})         # list of float
+            Operator.F_INIT: ['initial_state'],     # 1D Grid
+            Operator.O_F: ['final_state']})         # 1D Grid
 
     while instance.reuse_instance():
         # F_INIT
diff --git a/docs/source/examples/python/reaction_diffusion_qmc.py b/docs/source/examples/python/reaction_diffusion_qmc.py
index f96de4b1..07f8aba1 100644
--- a/docs/source/examples/python/reaction_diffusion_qmc.py
+++ b/docs/source/examples/python/reaction_diffusion_qmc.py
@@ -14,8 +14,8 @@ def reaction() -> None:
     """A simple exponential reaction model on a 1D grid.
     """
     instance = Instance({
-            Operator.F_INIT: ['initial_state'],     # list of float
-            Operator.O_F: ['final_state']})         # list of float
+            Operator.F_INIT: ['initial_state'],     # 1D Grid
+            Operator.O_F: ['final_state']})         # 1D Grid
 
     while instance.reuse_instance():
         # F_INIT
diff --git a/docs/source/examples/python/reaction_no_state_for_next_use.py b/docs/source/examples/python/reaction_no_state_for_next_use.py
index e0cb4dca..473d99e3 100644
--- a/docs/source/examples/python/reaction_no_state_for_next_use.py
+++ b/docs/source/examples/python/reaction_no_state_for_next_use.py
@@ -8,8 +8,8 @@ def reaction() -> None:
     """A simple exponential reaction model on a 1D grid.
     """
     instance = Instance({
-            Operator.F_INIT: ['initial_state'],     # list of float
-            Operator.O_F: ['final_state']},         # list of float
+            Operator.F_INIT: ['initial_state'],     # 1D Grid
+            Operator.O_F: ['final_state']},         # 1D Grid
             KEEPS_NO_STATE_FOR_NEXT_USE)
 
     while instance.reuse_instance():

From 4fd5ebbde5714a5c70d4777cf3013a9006ba8ed5 Mon Sep 17 00:00:00 2001
From: Lourens Veen <l.veen@esciencecenter.nl>
Date: Mon, 16 Jan 2023 23:33:33 +0100
Subject: [PATCH 161/183] Fix code blocks in examples README (#121)

---
 docs/source/examples/README.rst | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/source/examples/README.rst b/docs/source/examples/README.rst
index 1c824e7c..04867fd2 100644
--- a/docs/source/examples/README.rst
+++ b/docs/source/examples/README.rst
@@ -5,7 +5,7 @@ This directory contains examples for MUSCLE3. Once you've `installed MUSCLE3
 <https://muscle3.readthedocs.io/en/latest/installing.html>`_, you can build them
 by activating the installation and running Make from this directory, like this:
 
-.. code-block: bash
+.. code-block:: bash
 
     examples$ . /path/to/muscle3/bin/muscle3.env
     examples$ make
@@ -20,7 +20,7 @@ have MPI available, then those examples will be built as well.
 
 You can also build for each language separately, using
 
-.. code-block: bash
+.. code-block:: bash
 
     examples$ make python
     examples$ make cpp
@@ -42,7 +42,7 @@ submodels. Finally, you'll want to apply the settings, which are in
 For example, to run the all-Python version of the reaction-diffusion model, you
 can use:
 
-.. code-block: bash
+.. code-block:: bash
 
     examples$ muscle_manager --start-all rd_implementations.ymmsl rd_python.ymmsl rd_settings.ymmsl
 

From 278e07c2f687f0f62817e93cd8116909302ba69d Mon Sep 17 00:00:00 2001
From: Lourens Veen <l.veen@esciencecenter.nl>
Date: Mon, 16 Jan 2023 23:34:46 +0100
Subject: [PATCH 162/183] Remove requires.io badge, it's gone :( (#142)

---
 README.rst | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/README.rst b/README.rst
index eed265b4..504fd99f 100644
--- a/README.rst
+++ b/README.rst
@@ -17,10 +17,6 @@
     :target: https://www.codacy.com/gh/multiscale/muscle3/dashboard
     :alt: Codacy Grade
 
-.. image:: https://requires.io/github/multiscale/muscle3/requirements.svg?branch=develop
-     :target: https://requires.io/github/multiscale/muscle3/requirements/?branch=develop
-     :alt: Requirements Status
-
 .. image:: https://zenodo.org/badge/122876985.svg
    :target: https://zenodo.org/badge/latestdoi/122876985
 

From f633dd7fc36ebcbb90765f59756b5e7461afbe97 Mon Sep 17 00:00:00 2001
From: Lourens Veen <l.veen@esciencecenter.nl>
Date: Mon, 16 Jan 2023 23:55:28 +0100
Subject: [PATCH 163/183] Fix PyPI description (#103)

---
 setup.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/setup.py b/setup.py
index 467e2595..40470f97 100644
--- a/setup.py
+++ b/setup.py
@@ -16,10 +16,14 @@
         p for p in find_packages() + find_packages('libmuscle/python')
         if p != 'integration_test']
 
+_long_desc = (_here / 'README.rst').read_text()
+
 setup(
     name='muscle3',
     version=_version,
     description='Version 3 of the MUltiScale Coupling Library and Environment',
+    long_description=_long_desc,
+    long_description_content_type='text/x-rst',
     author='Lourens Veen',
     author_email='l.veen@esciencecenter.nl',
     url='https://github.com/multiscale/muscle3',

From 87021846896a5f473c78ad83a521c9209c8b6e50 Mon Sep 17 00:00:00 2001
From: Lourens Veen <l.veen@esciencecenter.nl>
Date: Tue, 17 Jan 2023 11:58:19 +0100
Subject: [PATCH 164/183] Fix -std=f2003 and -Jmod_dump issues on ifort

---
 libmuscle/fortran/build/libmuscle/Makefile       | 14 +++++++++++---
 libmuscle/fortran/build/libmuscle/tests/Makefile |  8 +++++++-
 libmuscle/fortran/build/ymmsl/Makefile           | 13 +++++++++++--
 3 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/libmuscle/fortran/build/libmuscle/Makefile b/libmuscle/fortran/build/libmuscle/Makefile
index 422bd815..0bdc788e 100644
--- a/libmuscle/fortran/build/libmuscle/Makefile
+++ b/libmuscle/fortran/build/libmuscle/Makefile
@@ -33,7 +33,15 @@ LDFLAGS_MPI := -L../ymmsl -lymmsl_fortran
 LDFLAGS_MPI += -L../../../cpp/build/libmuscle -lmuscle_mpi
 LDFLAGS_MPI += -L../../../cpp/build/ymmsl -lymmsl
 
-FFLAGS := -std=f2003 -O3 -I../ymmsl
+FFLAGS := -O3 -I../ymmsl
+ifeq ($(FC), ifort)
+	FFLAGS += -e03
+	MOD_DUMP := -module mod_dump
+else
+	FFLAGS += -std=f2003
+	MOD_DUMP := -Jmod_dump
+endif
+
 
 .PHONY: all
 all: libmuscle
@@ -73,13 +81,13 @@ install: $(installed_sources) $(installed_modules) $(installed_pkg_config_files)
 	$(FC) -c $(FFLAGS) $^ -o $@ $(LDFLAGS)
 
 %.lo: $(srcdir)/%.f90
-	$(FC) -c -fPIC -Jmod_dump $(FFLAGS) $^ -o $@ $(LDFLAGS)
+	$(FC) -c -fPIC $(MOD_DUMP) $(FFLAGS) $^ -o $@ $(LDFLAGS)
 
 %.mo: $(srcdir)/%.f90
 	$(MPIFC) -c $(FFLAGS) $^ -o $@ $(LDFLAGS_MPI)
 
 %.mlo: $(srcdir)/%.f90
-	$(MPIFC) -c -fPIC -Jmod_dump $(FFLAGS) $^ -o $@ $(LDFLAGS_MPI)
+	$(MPIFC) -c -fPIC $(MOD_DUMP) $(FFLAGS) $^ -o $@ $(LDFLAGS_MPI)
 
 %.mod: %.o
 
diff --git a/libmuscle/fortran/build/libmuscle/tests/Makefile b/libmuscle/fortran/build/libmuscle/tests/Makefile
index ebb20c73..f0e3af0d 100644
--- a/libmuscle/fortran/build/libmuscle/tests/Makefile
+++ b/libmuscle/fortran/build/libmuscle/tests/Makefile
@@ -37,7 +37,13 @@ ifeq "$(filter $(MAKECMDGOALS),$(cleantargets))" ""
 
 EXTRA_LINK_DIRS := $(foreach DIR,$(DEP_DIRS),-Wl,-rpath-link,$(DIR)/lib)
 
-FFLAGS += -std=f2003 -g -I. -I.. -I../../ymmsl
+FFLAGS += -g -I. -I.. -I../../ymmsl
+ifeq ($(FC), ifort)
+	FFLAGS += -e03
+else
+	FFLAGS += -std=f2003
+endif
+
 
 CPP_BUILD_DIR := $(CURDIR)/../../../../cpp/build
 
diff --git a/libmuscle/fortran/build/ymmsl/Makefile b/libmuscle/fortran/build/ymmsl/Makefile
index 8d8321a3..fd6f4800 100644
--- a/libmuscle/fortran/build/ymmsl/Makefile
+++ b/libmuscle/fortran/build/ymmsl/Makefile
@@ -14,7 +14,16 @@ installed_libs := $(libs:%=$(PREFIX)/lib/%)
 installed_pkg_config_files := $(pkg_config_files:%=$(PREFIX)/lib/pkgconfig/%)
 
 LDFLAGS := -L../../../cpp/build/ymmsl -lymmsl
-FFLAGS := -std=f2003 -O3
+
+FFLAGS := -O3
+ifeq ($(FC), ifort)
+	FFLAGS += -e03
+	MOD_DUMP := -module mod_dump
+else
+	FFLAGS += -std=f2003
+	MOD_DUMP := -Jmod_dump
+endif
+
 
 .PHONY: all
 all: ymmsl
@@ -43,7 +52,7 @@ install: $(installed_sources) $(installed_modules) $(installed_libs) $(installed
 	$(FC) -c $(FFLAGS) $^ -o $@ $(LDFLAGS)
 
 %.lo: $(srcdir)/%.f90
-	$(FC) -c -fPIC -Jmod_dump $(FFLAGS) $< -o $@ $(LDFLAGS)
+	$(FC) -c -fPIC $(MOD_DUMP) $(FFLAGS) $< -o $@ $(LDFLAGS)
 
 %.mod: %.o
 

From 6d0b5d7669102c2cd09e6a6ee600dca1a51428a7 Mon Sep 17 00:00:00 2001
From: Lourens Veen <l.veen@esciencecenter.nl>
Date: Tue, 17 Jan 2023 13:19:36 +0100
Subject: [PATCH 165/183] Test for ifx as well as ifort in build system

---
 libmuscle/fortran/build/libmuscle/Makefile       | 2 +-
 libmuscle/fortran/build/libmuscle/tests/Makefile | 2 +-
 libmuscle/fortran/build/ymmsl/Makefile           | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/libmuscle/fortran/build/libmuscle/Makefile b/libmuscle/fortran/build/libmuscle/Makefile
index 0bdc788e..a5d21fb1 100644
--- a/libmuscle/fortran/build/libmuscle/Makefile
+++ b/libmuscle/fortran/build/libmuscle/Makefile
@@ -34,7 +34,7 @@ LDFLAGS_MPI += -L../../../cpp/build/libmuscle -lmuscle_mpi
 LDFLAGS_MPI += -L../../../cpp/build/ymmsl -lymmsl
 
 FFLAGS := -O3 -I../ymmsl
-ifeq ($(FC), ifort)
+ifneq (,$(filter ifort ifx, $(FC)))
 	FFLAGS += -e03
 	MOD_DUMP := -module mod_dump
 else
diff --git a/libmuscle/fortran/build/libmuscle/tests/Makefile b/libmuscle/fortran/build/libmuscle/tests/Makefile
index f0e3af0d..a93d6bae 100644
--- a/libmuscle/fortran/build/libmuscle/tests/Makefile
+++ b/libmuscle/fortran/build/libmuscle/tests/Makefile
@@ -38,7 +38,7 @@ ifeq "$(filter $(MAKECMDGOALS),$(cleantargets))" ""
 EXTRA_LINK_DIRS := $(foreach DIR,$(DEP_DIRS),-Wl,-rpath-link,$(DIR)/lib)
 
 FFLAGS += -g -I. -I.. -I../../ymmsl
-ifeq ($(FC), ifort)
+ifneq (,$(filter ifort ifx, $(FC)))
 	FFLAGS += -e03
 else
 	FFLAGS += -std=f2003
diff --git a/libmuscle/fortran/build/ymmsl/Makefile b/libmuscle/fortran/build/ymmsl/Makefile
index fd6f4800..9b038f0f 100644
--- a/libmuscle/fortran/build/ymmsl/Makefile
+++ b/libmuscle/fortran/build/ymmsl/Makefile
@@ -16,7 +16,7 @@ installed_pkg_config_files := $(pkg_config_files:%=$(PREFIX)/lib/pkgconfig/%)
 LDFLAGS := -L../../../cpp/build/ymmsl -lymmsl
 
 FFLAGS := -O3
-ifeq ($(FC), ifort)
+ifneq (,$(filter ifort ifx, $(FC)))
 	FFLAGS += -e03
 	MOD_DUMP := -module mod_dump
 else

From f7d323bf52c77270f2be14ef1f3985f8b8112b93 Mon Sep 17 00:00:00 2001
From: Lourens Veen <l.veen@esciencecenter.nl>
Date: Tue, 17 Jan 2023 15:31:13 +0100
Subject: [PATCH 166/183] Allocate Fortran string return value even on error

---
 libmuscle/fortran/src/libmuscle/libmuscle.f90 | 82 +++++++++++++++++++
 .../fortran/src/libmuscle/libmuscle_mpi.f90   | 82 +++++++++++++++++++
 libmuscle/fortran/src/ymmsl/ymmsl.f90         | 17 ++++
 scripts/api_generator.py                      |  7 ++
 4 files changed, 188 insertions(+)

diff --git a/libmuscle/fortran/src/libmuscle/libmuscle.f90 b/libmuscle/fortran/src/libmuscle/libmuscle.f90
index 7c19c494..23a2ec88 100644
--- a/libmuscle/fortran/src/libmuscle/libmuscle.f90
+++ b/libmuscle/fortran/src/libmuscle/libmuscle.f90
@@ -6018,6 +6018,7 @@ function LIBMUSCLE_DataConstRef_as_logical( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -6076,6 +6077,8 @@ function LIBMUSCLE_DataConstRef_as_character( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+                allocate (character(0) :: LIBMUSCLE_DataConstRef_as_character)
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -6132,6 +6135,7 @@ function LIBMUSCLE_DataConstRef_as_int( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -6184,6 +6188,7 @@ function LIBMUSCLE_DataConstRef_as_int1( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -6236,6 +6241,7 @@ function LIBMUSCLE_DataConstRef_as_int2( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -6288,6 +6294,7 @@ function LIBMUSCLE_DataConstRef_as_int4( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -6340,6 +6347,7 @@ function LIBMUSCLE_DataConstRef_as_int8( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -6392,6 +6400,7 @@ function LIBMUSCLE_DataConstRef_as_real4( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -6444,6 +6453,7 @@ function LIBMUSCLE_DataConstRef_as_real8( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -6497,6 +6507,7 @@ function LIBMUSCLE_DataConstRef_as_settings( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -6555,6 +6566,7 @@ subroutine LIBMUSCLE_DataConstRef_as_byte_array( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -6612,6 +6624,7 @@ function LIBMUSCLE_DataConstRef_get_item_by_key( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -6668,6 +6681,7 @@ function LIBMUSCLE_DataConstRef_get_item_by_index( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -6720,6 +6734,7 @@ function LIBMUSCLE_DataConstRef_num_dims( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -6778,6 +6793,7 @@ subroutine LIBMUSCLE_DataConstRef_shape( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -9247,6 +9263,7 @@ function LIBMUSCLE_DataConstRef_has_indexes( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -9308,6 +9325,8 @@ function LIBMUSCLE_DataConstRef_index( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+                allocate (character(0) :: LIBMUSCLE_DataConstRef_index)
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -11451,6 +11470,7 @@ function LIBMUSCLE_Data_as_logical( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -11509,6 +11529,8 @@ function LIBMUSCLE_Data_as_character( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+                allocate (character(0) :: LIBMUSCLE_Data_as_character)
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -11565,6 +11587,7 @@ function LIBMUSCLE_Data_as_int( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -11617,6 +11640,7 @@ function LIBMUSCLE_Data_as_int1( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -11669,6 +11693,7 @@ function LIBMUSCLE_Data_as_int2( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -11721,6 +11746,7 @@ function LIBMUSCLE_Data_as_int4( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -11773,6 +11799,7 @@ function LIBMUSCLE_Data_as_int8( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -11825,6 +11852,7 @@ function LIBMUSCLE_Data_as_real4( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -11877,6 +11905,7 @@ function LIBMUSCLE_Data_as_real8( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -11930,6 +11959,7 @@ function LIBMUSCLE_Data_as_settings( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -11988,6 +12018,7 @@ subroutine LIBMUSCLE_Data_as_byte_array( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -12045,6 +12076,7 @@ function LIBMUSCLE_Data_get_item_by_key( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -12101,6 +12133,7 @@ function LIBMUSCLE_Data_get_item_by_index( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -12153,6 +12186,7 @@ function LIBMUSCLE_Data_num_dims( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -12211,6 +12245,7 @@ subroutine LIBMUSCLE_Data_shape( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -14680,6 +14715,7 @@ function LIBMUSCLE_Data_has_indexes( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -14741,6 +14777,8 @@ function LIBMUSCLE_Data_index( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+                allocate (character(0) :: LIBMUSCLE_Data_index)
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -14986,6 +15024,7 @@ subroutine LIBMUSCLE_Data_set_item_key_logical( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -15041,6 +15080,7 @@ subroutine LIBMUSCLE_Data_set_item_key_character( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -15096,6 +15136,7 @@ subroutine LIBMUSCLE_Data_set_item_key_int1( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -15151,6 +15192,7 @@ subroutine LIBMUSCLE_Data_set_item_key_int2( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -15206,6 +15248,7 @@ subroutine LIBMUSCLE_Data_set_item_key_int4( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -15261,6 +15304,7 @@ subroutine LIBMUSCLE_Data_set_item_key_int8( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -15316,6 +15360,7 @@ subroutine LIBMUSCLE_Data_set_item_key_real4( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -15371,6 +15416,7 @@ subroutine LIBMUSCLE_Data_set_item_key_real8( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -15426,6 +15472,7 @@ subroutine LIBMUSCLE_Data_set_item_key_data( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -15481,6 +15528,7 @@ subroutine LIBMUSCLE_Data_set_item_index_logical( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -15536,6 +15584,7 @@ subroutine LIBMUSCLE_Data_set_item_index_character( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -15591,6 +15640,7 @@ subroutine LIBMUSCLE_Data_set_item_index_int1( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -15646,6 +15696,7 @@ subroutine LIBMUSCLE_Data_set_item_index_int2( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -15701,6 +15752,7 @@ subroutine LIBMUSCLE_Data_set_item_index_int4( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -15756,6 +15808,7 @@ subroutine LIBMUSCLE_Data_set_item_index_int8( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -15811,6 +15864,7 @@ subroutine LIBMUSCLE_Data_set_item_index_real4( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -15866,6 +15920,7 @@ subroutine LIBMUSCLE_Data_set_item_index_real8( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -15921,6 +15976,7 @@ subroutine LIBMUSCLE_Data_set_item_index_data( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -15981,6 +16037,8 @@ function LIBMUSCLE_Data_key( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+                allocate (character(0) :: LIBMUSCLE_Data_key)
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -16041,6 +16099,7 @@ function LIBMUSCLE_Data_value( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -16158,6 +16217,8 @@ function LIBMUSCLE_PortsDescription_get( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+                allocate (character(0) :: LIBMUSCLE_PortsDescription_get)
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -16579,6 +16640,7 @@ function LIBMUSCLE_Instance_is_setting_a_character( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -16635,6 +16697,7 @@ function LIBMUSCLE_Instance_is_setting_a_int8( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -16691,6 +16754,7 @@ function LIBMUSCLE_Instance_is_setting_a_real8( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -16747,6 +16811,7 @@ function LIBMUSCLE_Instance_is_setting_a_logical( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -16803,6 +16868,7 @@ function LIBMUSCLE_Instance_is_setting_a_real8array( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -16859,6 +16925,7 @@ function LIBMUSCLE_Instance_is_setting_a_real8array2( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -16920,6 +16987,8 @@ function LIBMUSCLE_Instance_get_setting_as_character( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+                allocate (character(0) :: LIBMUSCLE_Instance_get_setting_as_character)
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -16979,6 +17048,7 @@ function LIBMUSCLE_Instance_get_setting_as_int8( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -17034,6 +17104,7 @@ function LIBMUSCLE_Instance_get_setting_as_real8( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -17090,6 +17161,7 @@ function LIBMUSCLE_Instance_get_setting_as_logical( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -17151,6 +17223,7 @@ subroutine LIBMUSCLE_Instance_get_setting_as_real8array( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -17213,6 +17286,7 @@ subroutine LIBMUSCLE_Instance_get_setting_as_real8array2( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -17399,6 +17473,7 @@ function LIBMUSCLE_Instance_receive_p( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -17458,6 +17533,7 @@ function LIBMUSCLE_Instance_receive_pd( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -17517,6 +17593,7 @@ function LIBMUSCLE_Instance_receive_ps( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -17579,6 +17656,7 @@ function LIBMUSCLE_Instance_receive_psd( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -17635,6 +17713,7 @@ function LIBMUSCLE_Instance_receive_with_settings_p( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -17694,6 +17773,7 @@ function LIBMUSCLE_Instance_receive_with_settings_pd( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -17753,6 +17833,7 @@ function LIBMUSCLE_Instance_receive_with_settings_ps( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -17815,6 +17896,7 @@ function LIBMUSCLE_Instance_receive_with_settings_psd( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
diff --git a/libmuscle/fortran/src/libmuscle/libmuscle_mpi.f90 b/libmuscle/fortran/src/libmuscle/libmuscle_mpi.f90
index 76a9940f..0f917303 100644
--- a/libmuscle/fortran/src/libmuscle/libmuscle_mpi.f90
+++ b/libmuscle/fortran/src/libmuscle/libmuscle_mpi.f90
@@ -6036,6 +6036,7 @@ function LIBMUSCLE_DataConstRef_as_logical( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -6094,6 +6095,8 @@ function LIBMUSCLE_DataConstRef_as_character( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+                allocate (character(0) :: LIBMUSCLE_DataConstRef_as_character)
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -6150,6 +6153,7 @@ function LIBMUSCLE_DataConstRef_as_int( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -6202,6 +6206,7 @@ function LIBMUSCLE_DataConstRef_as_int1( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -6254,6 +6259,7 @@ function LIBMUSCLE_DataConstRef_as_int2( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -6306,6 +6312,7 @@ function LIBMUSCLE_DataConstRef_as_int4( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -6358,6 +6365,7 @@ function LIBMUSCLE_DataConstRef_as_int8( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -6410,6 +6418,7 @@ function LIBMUSCLE_DataConstRef_as_real4( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -6462,6 +6471,7 @@ function LIBMUSCLE_DataConstRef_as_real8( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -6515,6 +6525,7 @@ function LIBMUSCLE_DataConstRef_as_settings( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -6573,6 +6584,7 @@ subroutine LIBMUSCLE_DataConstRef_as_byte_array( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -6630,6 +6642,7 @@ function LIBMUSCLE_DataConstRef_get_item_by_key( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -6686,6 +6699,7 @@ function LIBMUSCLE_DataConstRef_get_item_by_index( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -6738,6 +6752,7 @@ function LIBMUSCLE_DataConstRef_num_dims( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -6796,6 +6811,7 @@ subroutine LIBMUSCLE_DataConstRef_shape( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -9265,6 +9281,7 @@ function LIBMUSCLE_DataConstRef_has_indexes( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -9326,6 +9343,8 @@ function LIBMUSCLE_DataConstRef_index( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+                allocate (character(0) :: LIBMUSCLE_DataConstRef_index)
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -11469,6 +11488,7 @@ function LIBMUSCLE_Data_as_logical( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -11527,6 +11547,8 @@ function LIBMUSCLE_Data_as_character( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+                allocate (character(0) :: LIBMUSCLE_Data_as_character)
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -11583,6 +11605,7 @@ function LIBMUSCLE_Data_as_int( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -11635,6 +11658,7 @@ function LIBMUSCLE_Data_as_int1( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -11687,6 +11711,7 @@ function LIBMUSCLE_Data_as_int2( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -11739,6 +11764,7 @@ function LIBMUSCLE_Data_as_int4( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -11791,6 +11817,7 @@ function LIBMUSCLE_Data_as_int8( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -11843,6 +11870,7 @@ function LIBMUSCLE_Data_as_real4( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -11895,6 +11923,7 @@ function LIBMUSCLE_Data_as_real8( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -11948,6 +11977,7 @@ function LIBMUSCLE_Data_as_settings( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -12006,6 +12036,7 @@ subroutine LIBMUSCLE_Data_as_byte_array( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -12063,6 +12094,7 @@ function LIBMUSCLE_Data_get_item_by_key( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -12119,6 +12151,7 @@ function LIBMUSCLE_Data_get_item_by_index( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -12171,6 +12204,7 @@ function LIBMUSCLE_Data_num_dims( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -12229,6 +12263,7 @@ subroutine LIBMUSCLE_Data_shape( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -14698,6 +14733,7 @@ function LIBMUSCLE_Data_has_indexes( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -14759,6 +14795,8 @@ function LIBMUSCLE_Data_index( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+                allocate (character(0) :: LIBMUSCLE_Data_index)
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -15004,6 +15042,7 @@ subroutine LIBMUSCLE_Data_set_item_key_logical( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -15059,6 +15098,7 @@ subroutine LIBMUSCLE_Data_set_item_key_character( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -15114,6 +15154,7 @@ subroutine LIBMUSCLE_Data_set_item_key_int1( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -15169,6 +15210,7 @@ subroutine LIBMUSCLE_Data_set_item_key_int2( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -15224,6 +15266,7 @@ subroutine LIBMUSCLE_Data_set_item_key_int4( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -15279,6 +15322,7 @@ subroutine LIBMUSCLE_Data_set_item_key_int8( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -15334,6 +15378,7 @@ subroutine LIBMUSCLE_Data_set_item_key_real4( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -15389,6 +15434,7 @@ subroutine LIBMUSCLE_Data_set_item_key_real8( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -15444,6 +15490,7 @@ subroutine LIBMUSCLE_Data_set_item_key_data( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -15499,6 +15546,7 @@ subroutine LIBMUSCLE_Data_set_item_index_logical( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -15554,6 +15602,7 @@ subroutine LIBMUSCLE_Data_set_item_index_character( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -15609,6 +15658,7 @@ subroutine LIBMUSCLE_Data_set_item_index_int1( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -15664,6 +15714,7 @@ subroutine LIBMUSCLE_Data_set_item_index_int2( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -15719,6 +15770,7 @@ subroutine LIBMUSCLE_Data_set_item_index_int4( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -15774,6 +15826,7 @@ subroutine LIBMUSCLE_Data_set_item_index_int8( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -15829,6 +15882,7 @@ subroutine LIBMUSCLE_Data_set_item_index_real4( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -15884,6 +15938,7 @@ subroutine LIBMUSCLE_Data_set_item_index_real8( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -15939,6 +15994,7 @@ subroutine LIBMUSCLE_Data_set_item_index_data( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -15999,6 +16055,8 @@ function LIBMUSCLE_Data_key( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+                allocate (character(0) :: LIBMUSCLE_Data_key)
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -16059,6 +16117,7 @@ function LIBMUSCLE_Data_value( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -16176,6 +16235,8 @@ function LIBMUSCLE_PortsDescription_get( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+                allocate (character(0) :: LIBMUSCLE_PortsDescription_get)
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -16704,6 +16765,7 @@ function LIBMUSCLE_Instance_is_setting_a_character( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -16760,6 +16822,7 @@ function LIBMUSCLE_Instance_is_setting_a_int8( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -16816,6 +16879,7 @@ function LIBMUSCLE_Instance_is_setting_a_real8( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -16872,6 +16936,7 @@ function LIBMUSCLE_Instance_is_setting_a_logical( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -16928,6 +16993,7 @@ function LIBMUSCLE_Instance_is_setting_a_real8array( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -16984,6 +17050,7 @@ function LIBMUSCLE_Instance_is_setting_a_real8array2( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -17045,6 +17112,8 @@ function LIBMUSCLE_Instance_get_setting_as_character( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+                allocate (character(0) :: LIBMUSCLE_Instance_get_setting_as_character)
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -17104,6 +17173,7 @@ function LIBMUSCLE_Instance_get_setting_as_int8( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -17159,6 +17229,7 @@ function LIBMUSCLE_Instance_get_setting_as_real8( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -17215,6 +17286,7 @@ function LIBMUSCLE_Instance_get_setting_as_logical( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -17276,6 +17348,7 @@ subroutine LIBMUSCLE_Instance_get_setting_as_real8array( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -17338,6 +17411,7 @@ subroutine LIBMUSCLE_Instance_get_setting_as_real8array2( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -17524,6 +17598,7 @@ function LIBMUSCLE_Instance_receive_p( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -17583,6 +17658,7 @@ function LIBMUSCLE_Instance_receive_pd( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -17642,6 +17718,7 @@ function LIBMUSCLE_Instance_receive_ps( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -17704,6 +17781,7 @@ function LIBMUSCLE_Instance_receive_psd( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -17760,6 +17838,7 @@ function LIBMUSCLE_Instance_receive_with_settings_p( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -17819,6 +17898,7 @@ function LIBMUSCLE_Instance_receive_with_settings_pd( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -17878,6 +17958,7 @@ function LIBMUSCLE_Instance_receive_with_settings_ps( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -17940,6 +18021,7 @@ function LIBMUSCLE_Instance_receive_with_settings_psd( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
diff --git a/libmuscle/fortran/src/ymmsl/ymmsl.f90 b/libmuscle/fortran/src/ymmsl/ymmsl.f90
index 07e0db91..9e71bbd6 100644
--- a/libmuscle/fortran/src/ymmsl/ymmsl.f90
+++ b/libmuscle/fortran/src/ymmsl/ymmsl.f90
@@ -644,6 +644,7 @@ function YMMSL_Settings_is_a_character( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -700,6 +701,7 @@ function YMMSL_Settings_is_a_int4( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -756,6 +758,7 @@ function YMMSL_Settings_is_a_int8( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -812,6 +815,7 @@ function YMMSL_Settings_is_a_real8( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -868,6 +872,7 @@ function YMMSL_Settings_is_a_logical( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -924,6 +929,7 @@ function YMMSL_Settings_is_a_real8array( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -980,6 +986,7 @@ function YMMSL_Settings_is_a_real8array2( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -1146,6 +1153,8 @@ function YMMSL_Settings_get_as_character( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+                allocate (character(0) :: YMMSL_Settings_get_as_character)
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -1205,6 +1214,7 @@ function YMMSL_Settings_get_as_int4( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -1260,6 +1270,7 @@ function YMMSL_Settings_get_as_int8( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -1315,6 +1326,7 @@ function YMMSL_Settings_get_as_real8( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -1371,6 +1383,7 @@ function YMMSL_Settings_get_as_logical( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -1432,6 +1445,7 @@ subroutine YMMSL_Settings_get_as_real8array( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -1494,6 +1508,7 @@ subroutine YMMSL_Settings_get_as_real8array2( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
@@ -1598,6 +1613,8 @@ function YMMSL_Settings_key( &
                         err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)
                     end do
                 end if
+                allocate (character(0) :: YMMSL_Settings_key)
+
                 return
             else
                 call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))
diff --git a/scripts/api_generator.py b/scripts/api_generator.py
index e6c4f5a6..fc8a8a10 100644
--- a/scripts/api_generator.py
+++ b/scripts/api_generator.py
@@ -71,6 +71,9 @@ def f_call_c(self, result_name: str, call: str) -> str:
     def f_return_result(self, return_name: str, result_name: str) -> str:
         return '    {} = {}\n'.format(return_name, result_name)
 
+    def f_return_dummy_result(self, return_name: str) -> str:
+        return ''
+
     def _regular_type(self,
                       short_type: Union[str, List[Union[str, Tuple[str, str]]]]
                       ) -> List[Tuple[str, str]]:
@@ -168,6 +171,9 @@ def f_return_result(self, return_name: str, result_name: str) -> str:
                 '        {0}(i_loop:i_loop) = f_ret_ptr(i_loop)\n'
                 '    end do\n').format(return_name)
 
+    def f_return_dummy_result(self, return_name: str) -> str:
+        return '            allocate (character(0) :: {0})\n'.format(return_name)
+
     def fi_type(self) -> str:
         return self._regular_type(
                 ['character',
@@ -1425,6 +1431,7 @@ def fortran_function(self) -> str:
             result += '                    err_msg(err_msg_i:err_msg_i) = err_msg_f(err_msg_i)\n'
             result += '                end do\n'
             result += '            end if\n'
+            result += '{}\n'.format(self.ret_type.f_return_dummy_result(result_name))
             result += '            return\n'
             result += '        else\n'
             result += '            call c_f_pointer(err_msg_v, err_msg_f, (/err_msg_len_v/))\n'

From 716d9665fd7198cf01178d362fd4ddab675a916f Mon Sep 17 00:00:00 2001
From: Lourens Veen <l.veen@esciencecenter.nl>
Date: Tue, 17 Jan 2023 15:31:58 +0100
Subject: [PATCH 167/183] Run Fortran tests under Valgrind, if available

---
 libmuscle/fortran/build/libmuscle/tests/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libmuscle/fortran/build/libmuscle/tests/Makefile b/libmuscle/fortran/build/libmuscle/tests/Makefile
index a93d6bae..66afaf9f 100644
--- a/libmuscle/fortran/build/libmuscle/tests/Makefile
+++ b/libmuscle/fortran/build/libmuscle/tests/Makefile
@@ -79,5 +79,5 @@ test_dep_lib_paths := $(subst $(space),:,$(foreach DIR,$(DEP_DIRS),$(DIR)/lib))
 
 .PHONY: run_test%
 run_test%: test%
-	export LD_LIBRARY_PATH=$(test_dep_lib_paths):$(LD_LIBRARY_PATH) ; ./$<
+	export LD_LIBRARY_PATH=$(test_dep_lib_paths):$(LD_LIBRARY_PATH) ; $(VALGRIND) ./$<
 

From b583ccba526122ffa21d0d55f6934c772d5fcabe Mon Sep 17 00:00:00 2001
From: Lourens Veen <l.veen@esciencecenter.nl>
Date: Tue, 17 Jan 2023 15:32:21 +0100
Subject: [PATCH 168/183] Build apigen tests with correct Fortran compiler and
 use Valgrind

---
 scripts/tests/Makefile | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/scripts/tests/Makefile b/scripts/tests/Makefile
index 29d53fad..69ff691e 100644
--- a/scripts/tests/Makefile
+++ b/scripts/tests/Makefile
@@ -1,7 +1,7 @@
 all: test_use_echolib
 
 CXX?=g++
-FORTRAN?=gfortran
+FC?=gfortran
 
 
 .PHONY: clean
@@ -22,7 +22,7 @@ libecholib.so: echolib.hpp echolib.cpp echolib_fortran_c.cpp cmdlineargs.cpp
 	$(CXX) -g -o $@ -std=c++14 -pedantic -I. -fPIC -shared echolib.cpp cmdlineargs.cpp echolib_fortran_c.cpp
 
 test_use_echolib: libecholib.so echolib.f90 test_use_echolib.f90
-	$(FORTRAN) -g -std=f2003 -o $@ echolib.f90 test_use_echolib.f90 -L. -lecholib
+	$(FC) -g -std=f2003 -o $@ echolib.f90 test_use_echolib.f90 -L. -lecholib
 
 test: test_use_echolib
-	LD_LIBRARY_PATH=$(PWD):$(LD_LIBRARY_PATH) ./test_use_echolib
+	LD_LIBRARY_PATH=$(PWD):$(LD_LIBRARY_PATH) $(VALGRIND) ./test_use_echolib

From 49318e47bbd67b96a90c153830e6ab01310f2f7a Mon Sep 17 00:00:00 2001
From: Lourens Veen <l.veen@esciencecenter.nl>
Date: Tue, 17 Jan 2023 17:16:31 +0100
Subject: [PATCH 169/183] Allow passing a longer array to DCF_shape

---
 libmuscle/fortran/src/libmuscle/libmuscle.f90     | 4 ++--
 libmuscle/fortran/src/libmuscle/libmuscle_mpi.f90 | 4 ++--
 scripts/api_generator.py                          | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/libmuscle/fortran/src/libmuscle/libmuscle.f90 b/libmuscle/fortran/src/libmuscle/libmuscle.f90
index 23a2ec88..3241020e 100644
--- a/libmuscle/fortran/src/libmuscle/libmuscle.f90
+++ b/libmuscle/fortran/src/libmuscle/libmuscle.f90
@@ -6811,7 +6811,7 @@ subroutine LIBMUSCLE_DataConstRef_shape( &
         end if
 
         call c_f_pointer(ret_val, f_ret_ptr, (/ret_val_size/))
-        shp = f_ret_ptr
+        shp(1:ret_val_size) = f_ret_ptr
     end subroutine LIBMUSCLE_DataConstRef_shape
 
     subroutine LIBMUSCLE_DataConstRef_elements_1_logical( &
@@ -12263,7 +12263,7 @@ subroutine LIBMUSCLE_Data_shape( &
         end if
 
         call c_f_pointer(ret_val, f_ret_ptr, (/ret_val_size/))
-        shp = f_ret_ptr
+        shp(1:ret_val_size) = f_ret_ptr
     end subroutine LIBMUSCLE_Data_shape
 
     subroutine LIBMUSCLE_Data_elements_1_logical( &
diff --git a/libmuscle/fortran/src/libmuscle/libmuscle_mpi.f90 b/libmuscle/fortran/src/libmuscle/libmuscle_mpi.f90
index 0f917303..2e587eb4 100644
--- a/libmuscle/fortran/src/libmuscle/libmuscle_mpi.f90
+++ b/libmuscle/fortran/src/libmuscle/libmuscle_mpi.f90
@@ -6829,7 +6829,7 @@ subroutine LIBMUSCLE_DataConstRef_shape( &
         end if
 
         call c_f_pointer(ret_val, f_ret_ptr, (/ret_val_size/))
-        shp = f_ret_ptr
+        shp(1:ret_val_size) = f_ret_ptr
     end subroutine LIBMUSCLE_DataConstRef_shape
 
     subroutine LIBMUSCLE_DataConstRef_elements_1_logical( &
@@ -12281,7 +12281,7 @@ subroutine LIBMUSCLE_Data_shape( &
         end if
 
         call c_f_pointer(ret_val, f_ret_ptr, (/ret_val_size/))
-        shp = f_ret_ptr
+        shp(1:ret_val_size) = f_ret_ptr
     end subroutine LIBMUSCLE_Data_shape
 
     subroutine LIBMUSCLE_Data_elements_1_logical( &
diff --git a/scripts/api_generator.py b/scripts/api_generator.py
index fc8a8a10..f26e38ee 100644
--- a/scripts/api_generator.py
+++ b/scripts/api_generator.py
@@ -400,7 +400,7 @@ def f_call_c(self, result_name: str, call: str) -> str:
 
     def f_return_result(self, return_name: str, result_name: str) -> str:
         return ('    call c_f_pointer(ret_val, f_ret_ptr, (/ret_val_size/))\n'
-                '    {} = f_ret_ptr\n').format(return_name)
+                '    {}(1:ret_val_size) = f_ret_ptr\n').format(return_name)
 
     def fi_type(self) -> str:
         return self._regular_type(

From 2b2539068867872f51f46666acf26dc773982e05 Mon Sep 17 00:00:00 2001
From: Lourens Veen <l.veen@esciencecenter.nl>
Date: Tue, 17 Jan 2023 17:17:05 +0100
Subject: [PATCH 170/183] Fix memory leak in Fortran test

---
 libmuscle/fortran/src/libmuscle/tests/test_data.f90 | 1 +
 1 file changed, 1 insertion(+)

diff --git a/libmuscle/fortran/src/libmuscle/tests/test_data.f90 b/libmuscle/fortran/src/libmuscle/tests/test_data.f90
index 6cb22ff1..51b16098 100644
--- a/libmuscle/fortran/src/libmuscle/tests/test_data.f90
+++ b/libmuscle/fortran/src/libmuscle/tests/test_data.f90
@@ -157,6 +157,7 @@ subroutine test_data_settings
         call YMMSL_Settings_free(s1)
         call YMMSL_Settings_free(s2)
         call LIBMUSCLE_Data_free(d1)
+        call LIBMUSCLE_Data_free(d2)
         print *, '[       OK ] data.settings'
     end subroutine test_data_settings
 

From 70d237423a70e0e3812fc187fb96fc72e46f203a Mon Sep 17 00:00:00 2001
From: Lourens Veen <l.veen@esciencecenter.nl>
Date: Tue, 17 Jan 2023 18:08:16 +0100
Subject: [PATCH 171/183] Fix parallel build of Fortran tests

---
 libmuscle/fortran/src/libmuscle/tests/test_data.f90         | 6 +++---
 libmuscle/fortran/src/libmuscle/tests/test_message.f90      | 6 +++---
 libmuscle/fortran/src/libmuscle/tests/test_operator.f90     | 6 +++---
 .../fortran/src/libmuscle/tests/test_ports_description.f90  | 6 +++---
 libmuscle/fortran/src/libmuscle/tests/test_settings.f90     | 6 +++---
 5 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/libmuscle/fortran/src/libmuscle/tests/test_data.f90 b/libmuscle/fortran/src/libmuscle/tests/test_data.f90
index 51b16098..7a27606a 100644
--- a/libmuscle/fortran/src/libmuscle/tests/test_data.f90
+++ b/libmuscle/fortran/src/libmuscle/tests/test_data.f90
@@ -1,4 +1,4 @@
-module tests
+module data_tests
     use assert
     implicit none
 contains
@@ -505,10 +505,10 @@ subroutine test_data_byte_array
 
         print *, '[       OK ] data.byte_array'
     end subroutine test_data_byte_array
-end module tests
+end module data_tests
 
 program test_data
-    use tests
+    use data_tests
     implicit none
 
     print *, ''
diff --git a/libmuscle/fortran/src/libmuscle/tests/test_message.f90 b/libmuscle/fortran/src/libmuscle/tests/test_message.f90
index 43ca94cc..59f60768 100644
--- a/libmuscle/fortran/src/libmuscle/tests/test_message.f90
+++ b/libmuscle/fortran/src/libmuscle/tests/test_message.f90
@@ -1,4 +1,4 @@
-module tests
+module message_tests
     use assert
     implicit none
 contains
@@ -141,10 +141,10 @@ subroutine test_message_settings
         call LIBMUSCLE_Message_free(m1)
         print *, '[       OK ] message.settings'
     end subroutine test_message_settings
-end module tests
+end module message_tests
 
 program test_message
-    use tests
+    use message_tests
     implicit none
 
     print *, ''
diff --git a/libmuscle/fortran/src/libmuscle/tests/test_operator.f90 b/libmuscle/fortran/src/libmuscle/tests/test_operator.f90
index 42fa9b30..be7ce72c 100644
--- a/libmuscle/fortran/src/libmuscle/tests/test_operator.f90
+++ b/libmuscle/fortran/src/libmuscle/tests/test_operator.f90
@@ -1,4 +1,4 @@
-module tests
+module operator_tests
     use assert
     implicit none
 contains
@@ -16,10 +16,10 @@ subroutine test_operator_use
         o1 = YMMSL_Operator_O_F
         print *, '[       OK ] operator.use'
     end subroutine test_operator_use
-end module tests
+end module operator_tests
 
 program test_operator
-    use tests
+    use operator_tests
     implicit none
 
     print *, ''
diff --git a/libmuscle/fortran/src/libmuscle/tests/test_ports_description.f90 b/libmuscle/fortran/src/libmuscle/tests/test_ports_description.f90
index e53f1caa..02ebcb3a 100644
--- a/libmuscle/fortran/src/libmuscle/tests/test_ports_description.f90
+++ b/libmuscle/fortran/src/libmuscle/tests/test_ports_description.f90
@@ -1,4 +1,4 @@
-module tests
+module ports_description_tests
     use assert
     implicit none
 contains
@@ -18,10 +18,10 @@ subroutine test_ports_description_use
         call LIBMUSCLE_PortsDescription_free(pd)
         print *, '[       OK ] ports_description.use'
     end subroutine test_ports_description_use
-end module tests
+end module ports_description_tests
 
 program test_ports_description
-    use tests
+    use ports_description_tests
     implicit none
 
     print *, ''
diff --git a/libmuscle/fortran/src/libmuscle/tests/test_settings.f90 b/libmuscle/fortran/src/libmuscle/tests/test_settings.f90
index 38919fdb..589fe1fb 100644
--- a/libmuscle/fortran/src/libmuscle/tests/test_settings.f90
+++ b/libmuscle/fortran/src/libmuscle/tests/test_settings.f90
@@ -1,4 +1,4 @@
-module tests
+module settings_tests
     use assert
     implicit none
 contains
@@ -218,10 +218,10 @@ subroutine test_settings_key
         call YMMSL_Settings_free(s1)
         print *, '[       OK ] settings.key'
     end subroutine test_settings_key
-end module tests
+end module settings_tests
 
 program test_settings
-    use tests
+    use settings_tests
     implicit none
 
     print *, ''

From 5ce79d9a757bf9670f720dae118e05d5f0263f9d Mon Sep 17 00:00:00 2001
From: Lourens Veen <l.veen@esciencecenter.nl>
Date: Tue, 17 Jan 2023 18:36:46 +0100
Subject: [PATCH 172/183] Make integration tests work with Intel MPI

---
 integration_test/conftest.py             | 26 ++++++++++++++++++++++++
 integration_test/test_mpi_macro_micro.py |  5 +++--
 integration_test/test_start_mpi.py       |  7 ++++---
 3 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/integration_test/conftest.py b/integration_test/conftest.py
index a6c70b1e..210659b7 100644
--- a/integration_test/conftest.py
+++ b/integration_test/conftest.py
@@ -210,3 +210,29 @@ def log_file_in_tmpdir(tmpdir):
     yield None
 
     os.chdir(old_workdir)
+
+
+@pytest.fixture
+def mpi_is_intel():
+    if 'MUSCLE_ENABLE_CPP_MPI' not in os.environ:
+        return None
+
+    result = subprocess.run(
+            ['mpirun', '--version'], capture_output=True, check=True)
+    return 'Intel' in result.stdout.decode('utf-8')
+
+
+@pytest.fixture
+def mpirun_outfile_arg(mpi_is_intel):
+    if mpi_is_intel:
+        return '-outfile-pattern'
+    else:
+        return '--output-filename'
+
+
+@pytest.fixture
+def mpi_exec_model(mpi_is_intel):
+    if mpi_is_intel:
+        return 'intelmpi'
+    else:
+        return 'openmpi'
diff --git a/integration_test/test_mpi_macro_micro.py b/integration_test/test_mpi_macro_micro.py
index 2b02fa76..2e9eb39a 100644
--- a/integration_test/test_mpi_macro_micro.py
+++ b/integration_test/test_mpi_macro_micro.py
@@ -37,7 +37,8 @@ def macro():
 
 
 @skip_if_python_only
-def test_mpi_macro_micro(tmpdir, mmp_server_process_simple):
+def test_mpi_macro_micro(
+        tmpdir, mmp_server_process_simple, mpirun_outfile_arg):
     # only run this if MPI is enabled
     if 'MUSCLE_ENABLE_CPP_MPI' not in os.environ:
         pytest.skip('MPI support was not detected')
@@ -58,7 +59,7 @@ def test_mpi_macro_micro(tmpdir, mmp_server_process_simple):
     mpi_test_micro = cpp_test_dir / 'mpi_micro_model_test'
     out_file = tmpdir + '/mpi_micro.log'
     micro_result = subprocess.Popen(
-            ['mpirun', '-np', '2', '--output-filename', out_file,
+            ['mpirun', '-np', '2', mpirun_outfile_arg, out_file,
              str(mpi_test_micro), '--muscle-instance=micro'], env=env)
 
     # run macro model
diff --git a/integration_test/test_start_mpi.py b/integration_test/test_start_mpi.py
index dca5c9d2..2ba3d522 100644
--- a/integration_test/test_start_mpi.py
+++ b/integration_test/test_start_mpi.py
@@ -11,7 +11,7 @@
 
 
 @skip_if_python_only
-def test_start_mpi(tmpdir):
+def test_start_mpi(tmpdir, mpi_exec_model):
     # only run this if MPI is enabled
     if 'MUSCLE_ENABLE_CPP_MPI' not in os.environ:
         pytest.skip('MPI support was not detected')
@@ -55,14 +55,15 @@ def test_start_mpi(tmpdir):
             '    env:\n'
             '      +LD_LIBRARY_PATH: :{}\n'
             '    executable: {}\n'
-            '    execution_model: openmpi\n'
+            '    execution_model: {}\n'
             'resources:\n'
             '  macro:\n'
             '    threads: 1\n'
             '  micro:\n'
             '    mpi_processes: 2\n'
             ).format(
-                ld_lib_path, test_component, ld_lib_path, mpi_test_component))
+                ld_lib_path, test_component, ld_lib_path, mpi_test_component,
+                mpi_exec_model))
 
     config = ymmsl.load(ymmsl_text)
 

From 0e82a697226ba3e004e55060cfa9721dc78ec1d2 Mon Sep 17 00:00:00 2001
From: Lourens Veen <l.veen@esciencecenter.nl>
Date: Tue, 17 Jan 2023 18:37:19 +0100
Subject: [PATCH 173/183] Make script tests work better with Intel compilers

---
 scripts/tests/Makefile    | 8 +++++++-
 scripts/tests/echolib.cpp | 2 +-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/scripts/tests/Makefile b/scripts/tests/Makefile
index 69ff691e..f0b75a00 100644
--- a/scripts/tests/Makefile
+++ b/scripts/tests/Makefile
@@ -3,6 +3,12 @@ all: test_use_echolib
 CXX?=g++
 FC?=gfortran
 
+ifneq (,$(filter ifort ifx, $(FC)))
+	FFLAGS += -e03
+else
+	FFLAGS += -std=f2003
+endif
+
 
 .PHONY: clean
 clean:
@@ -22,7 +28,7 @@ libecholib.so: echolib.hpp echolib.cpp echolib_fortran_c.cpp cmdlineargs.cpp
 	$(CXX) -g -o $@ -std=c++14 -pedantic -I. -fPIC -shared echolib.cpp cmdlineargs.cpp echolib_fortran_c.cpp
 
 test_use_echolib: libecholib.so echolib.f90 test_use_echolib.f90
-	$(FC) -g -std=f2003 -o $@ echolib.f90 test_use_echolib.f90 -L. -lecholib
+	$(FC) -g $(FFLAGS) -o $@ echolib.f90 test_use_echolib.f90 -L. -lecholib
 
 test: test_use_echolib
 	LD_LIBRARY_PATH=$(PWD):$(LD_LIBRARY_PATH) $(VALGRIND) ./test_use_echolib
diff --git a/scripts/tests/echolib.cpp b/scripts/tests/echolib.cpp
index e357a4a5..a9b1ded1 100644
--- a/scripts/tests/echolib.cpp
+++ b/scripts/tests/echolib.cpp
@@ -30,7 +30,7 @@ double Echo::echo_double(double value) const {
 }
 
 bool Echo::echo_bool(bool value) const {
-    assert(value == true);
+    assert(value);
     return value;
 }
 

From df91ac226d58f2e71d23e4667735ee30ecf69130 Mon Sep 17 00:00:00 2001
From: Lourens Veen <l.veen@esciencecenter.nl>
Date: Tue, 17 Jan 2023 18:59:57 +0100
Subject: [PATCH 174/183] Improve example build on Intel compiler

---
 docs/source/examples/fortran/build/Makefile | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/docs/source/examples/fortran/build/Makefile b/docs/source/examples/fortran/build/Makefile
index 233ce47f..2d859c3f 100644
--- a/docs/source/examples/fortran/build/Makefile
+++ b/docs/source/examples/fortran/build/Makefile
@@ -1,9 +1,21 @@
-FFLAGS += -std=f2003 -g $(shell pkg-config --cflags libmuscle_fortran ymmsl_fortran)
+FFLAGS += -g $(shell pkg-config --cflags libmuscle_fortran ymmsl_fortran)
 LDFLAGS := $(shell pkg-config --libs libmuscle_fortran ymmsl_fortran)
 
-MPI_FFLAGS := -std=f2003 -g $(shell pkg-config --cflags libmuscle_mpi_fortran ymmsl_fortran)
+MPI_FFLAGS := -g $(shell pkg-config --cflags libmuscle_mpi_fortran ymmsl_fortran)
 MPI_LDFLAGS := $(shell pkg-config --libs libmuscle_mpi_fortran ymmsl_fortran)
 
+ifneq (,$(filter ifort ifx, $(FC)))
+	FFLAGS += -e03
+else
+	FFLAGS += -std=f2003
+endif
+
+ifneq (,$(filter mpiifort, $(MPIFC)))
+	MPI_FFLAGS += -e03
+else
+	MPI_FFLAGS += -std=f2003
+endif
+
 
 binaries := reaction diffusion mc_driver load_balancer
 mpi_binaries := reaction_mpi

From 7a235fdf61cc326617cb25f76030bb81fe6fbaed Mon Sep 17 00:00:00 2001
From: Lourens Veen <l.veen@esciencecenter.nl>
Date: Tue, 17 Jan 2023 19:05:13 +0100
Subject: [PATCH 175/183] Add Intel compatibility run to CI

---
 .github/workflows/ci_ubuntu20.04_intel.yaml | 22 +++++++++++++++++++++
 .github/workflows/ci_ubuntu22.04_intel.yaml | 20 +++++++++++++++++++
 2 files changed, 42 insertions(+)
 create mode 100644 .github/workflows/ci_ubuntu20.04_intel.yaml
 create mode 100644 .github/workflows/ci_ubuntu22.04_intel.yaml

diff --git a/.github/workflows/ci_ubuntu20.04_intel.yaml b/.github/workflows/ci_ubuntu20.04_intel.yaml
new file mode 100644
index 00000000..c39874c8
--- /dev/null
+++ b/.github/workflows/ci_ubuntu20.04_intel.yaml
@@ -0,0 +1,22 @@
+# Run Continuous Integration for the latest Ubuntu release
+# This mainly checks for issues/regressions in the native build
+# Note that this is not just ubuntu 20.04, but also the oldest currently
+# available version of the Intel compiler.
+name: native_compatibility_ubuntu20.04_intel
+on:
+  schedule:
+    - cron: '30 5 * * 0'
+  push:
+    branches:
+      - 'release-*'
+      - fix_native_compatibility_ci
+      - issue-25-intel-compiler-support
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v2
+
+    - name: Run tests on Ubuntu 20.04 with Clang
+      run: docker run -v "${GITHUB_WORKSPACE}:/workspace" --env LC_ALL=C.UTF-8 --env LANG=C.UTF-8 --env DEBIAN_FRONTEND=noninteractive ubuntu:20.04 /bin/bash -c 'apt-get update && apt-get -y install wget && wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB && mv GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB /etc/apt/trusted.gpg.d/intel-sw-products.asc && echo "deb https://apt.repos.intel.com/oneapi all main" >/etc/apt/sources.list.d/oneAPI.list && apt-get update && apt-get -y dist-upgrade && apt-get -y install build-essential cmake git valgrind pkg-config python3 python3-pip python3-venv curl intel-oneapi-compiler-dpcpp-cpp-2021.1.1 intel-oneapi-compiler-fortran-2021.1.1 intel-oneapi-mpi-devel-2021.1.1 && apt-get -y remove libssl-dev zlib1g-dev && useradd -m -d /home/muscle3 muscle3 && su muscle3 -c -- "cp -r --preserve=mode /workspace /home/muscle3/muscle3" && su muscle3 -c -- "pip3 install --user -U \"pip<22\" setuptools wheel" && su muscle3 -c -- "pip3 install --user \"ymmsl>=0.12.0,<0.13\" qcg-pilotjob==0.13.1" && su muscle3 -c -s /bin/bash -- "cd /home/muscle3/muscle3 && . /opt/intel/oneapi/setvars.sh && CXX=icpx MPICXX=\"mpiicpc -cxx=icpx\" FC=ifx MPIFC=\"mpiifort -fc=ifx\" make test_examples"'
diff --git a/.github/workflows/ci_ubuntu22.04_intel.yaml b/.github/workflows/ci_ubuntu22.04_intel.yaml
new file mode 100644
index 00000000..b1c11cc5
--- /dev/null
+++ b/.github/workflows/ci_ubuntu22.04_intel.yaml
@@ -0,0 +1,20 @@
+# Run Continuous Integration for the latest Ubuntu release
+# This mainly checks for issues/regressions in the native build
+name: native_compatibility_ubuntu22.04_intel
+on:
+  schedule:
+    - cron: '0 5 * * 0'
+  push:
+    branches:
+      - 'release-*'
+      - fix_native_compatibility_ci
+      - issue-25-intel-compiler-support
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v2
+
+    - name: Run tests on Ubuntu 22.04 with the Intel compiler
+      run: docker run -v "${GITHUB_WORKSPACE}:/workspace" --env LC_ALL=C.UTF-8 --env LANG=C.UTF-8 --env DEBIAN_FRONTEND=noninteractive ubuntu:22.04 /bin/bash -c 'apt-get update && apt-get -y install wget && wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB && mv GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB /etc/apt/trusted.gpg.d/intel-sw-products.asc && echo "deb https://apt.repos.intel.com/oneapi all main" >/etc/apt/sources.list.d/oneAPI.list && apt-get update && apt-get -y dist-upgrade && apt-get -y install build-essential cmake git valgrind pkg-config python3 python3-pip python3-venv curl intel-oneapi-compiler-dpcpp-cpp intel-oneapi-compiler-fortran intel-oneapi-mpi-devel && apt-get -y remove libssl-dev && useradd -m -d /home/muscle3 muscle3 && su muscle3 -c -- "cp -r --preserve=mode /workspace /home/muscle3/muscle3" && su muscle3 -c -- "pip3 install -U pip setuptools wheel" && su muscle3 -c -- "pip3 install \"ymmsl>=0.12.0,<0.13\" qcg-pilotjob==0.13.1" && su muscle3 -c -- "cd /home/muscle3/muscle3 && . /opt/intel/oneapi/setvars.sh && MPICXX=\"mpiicpc -cxx=icpx\" CXX=icpx MPIFC=\"mpiifort -fc=ifx\" FC=ifx make test_examples"'

From d7695935dc09a9cb515e7d357fb50973e942c757 Mon Sep 17 00:00:00 2001
From: Lourens Veen <l.veen@esciencecenter.nl>
Date: Tue, 17 Jan 2023 20:48:34 +0100
Subject: [PATCH 176/183] Add documentation on building with the Intel compiler

---
 docs/source/installing.rst.in | 92 +++++++++++++++++++----------------
 1 file changed, 49 insertions(+), 43 deletions(-)

diff --git a/docs/source/installing.rst.in b/docs/source/installing.rst.in
index 25620d03..adc7a399 100644
--- a/docs/source/installing.rst.in
+++ b/docs/source/installing.rst.in
@@ -69,11 +69,10 @@ Prerequisites
 
 To build libmuscle, we're going to need some tools. In particular, we need a C++
 compiler and GNU make. MUSCLE3 uses C++14, so you need at least g++ 4.9.3.
-Clang is expected to work, but that's not been tested. Using the Intel toolchain
-currently does not work, but see below for information about building submodels
-with the Intel tools.  Other compilers have not been tested. If you want to try,
-go right ahead, we'd love to have feedback on this. Building has been tested
-with gmake 3.82 and 4.1.
+Clang is also supported, as are the proprietary Intel® [#f1]_ compilers; see the
+instructions below for how to use them. Other compilers have not been tested.
+If you want to try, go right ahead, we'd love to have feedback on this. Building
+has been tested with gmake 3.82 and 4.1.
 
 If you're doing C++ development on a reasonably modern Linux, then you probably
 already have a suitable compiler installed. If not, on a Debian (or Ubuntu)
@@ -92,13 +91,20 @@ build command a bit, see below. Note that clang does not have a production-ready
 Fortran compiler yet, but the commands below will help you build the C++ part
 with clang, and the Fortran part with gfortran.
 
+For Intel®, you will have to install the Intel® oneAPI according to the
+instructions on the Intel® website. If you're on Ubuntu and you chose to use the
+repository, then you need at least ``intel-oneapi-compiler-dpcpp-cpp``
+and ``intel-oneapi-compiler-fortran``.
+
 If your submodels use MPI, then you'll need to compile the MPI support for
 MUSCLE3. This requires an MPI library to be available. Libmuscle has been
 tested with OpenMPI on Ubuntu, but should work with other MPI implementations
 (this being the point of the MPI standard). On Debian/Ubuntu, ``sudo apt-get
 install libopenmpi-dev`` will install the OpenMPI development files needed to
 compile libmuscle C++ with MPI support. On a cluster, there is probably a
-``module load openmpi`` or similar command to make MPI available. MUSCLE3 only
+``module load openmpi`` or similar command to make MPI available. If you are
+using the Intel® compiler, then you'll probably want to use Intel® MPI as well,
+which you can get from the ``intel-oneapi-mpi-devel`` package. MUSCLE3 only
 uses very basic MPI functionality, so any version should do. MUSCLE3 will
 automatically detect the availability of MPI, and build the MPI version of the
 library if MPI is available.
@@ -215,7 +221,7 @@ To build with clang, use
 
 .. code-block:: bash
 
-  ~/mucle3_source/muscle3-0.5.0$ CXXFLAGS=-fPIE OMPI_CXX=clang++ CXX=clang++ make
+  ~/mucle3_source/muscle3-%%VERSION%%$ CXXFLAGS=-fPIE OMPI_CXX=clang++ CXX=clang++ make
 
 
 This will tell the build system to use clang for compiling the C++ code and its
@@ -223,6 +229,20 @@ MPI support, but still use gfortran to compile the Fortran code (if gfortran is
 installed). The extra ``-fPIE`` switch is needed to make that combination work
 on some common platforms.
 
+**Building with Intel®**
+
+To build with the Intel® compiler, use
+
+.. code-block:: bash
+
+  ~/muscle3_source/muscle3-%%VERSION%%$ CXX=icpx MPICXX='mpiicpc -cxx=icpx' FC=ifx MPIFC='mpiifort -fc=ifx' make
+
+
+This will tell the build system to use the Intel® C++ and Fortran compilers to
+compile MUSCLE3. Note that the Classic compilers (``icc`` and ``ifort``) seem to
+work for the library but not for the test suite. As they are obsolete, this will
+not be fixed.
+
 
 Getting help
 ````````````
@@ -405,29 +425,14 @@ command:
 If you have just installed MUSCLE3, then the above bits are currently on your
 screen, so you can just copy-paste them from there.
 
-Intel C++ compiler
+Intel® C++ compiler
 ``````````````````
 
-Compiling MUSCLE3 with the Intel compiler is currently not supported.
-Fortunately, in a typical multiscale simulation, only a small amount of time is
-spent communicating through MUSCLE3. Your submodels will spend most of their
-time either computing or waiting. Therefore, it helps to compile the submodels
-with the Intel compiler for some extra performance, but using the Intel compiler
-for MUSCLE3 doesn't add much anyway.
-
-To compile your submodels with the Intel compiler, first use the GNU compiler
-to install MUSCLE3. Then, switch to the Intel compiler, and use it to compile
-and link it to MUSCLE3 as described above. The compilers are link-compatible,
-so this should work. (See below if your model is written in Fortran, you need
-to do it slightly differently there.)
-
-Intel has recently made it a bit easier to install their proprietary compilers,
-which should allow us to integrated it into the Continuous Integration system.
-That should enable support for it in MUSCLE3, and since many people want to use
-the Intel compilers (especially for compiling Fortran codes) we plan to do so in
-a future release. If this interests you, please add a comment to the `issue on
-Intel compiler support <https://github.com/multiscale/muscle3/issues/25>`_ so we
-know.
+You can compile MUSCLE3 with the Intel® compiler as described above, and then
+use it for your model as well. It is also possible to compile MUSCLE3 with the
+GNU compiler and your model with the Intel® compiler, and link them together.
+(See below if your model is written in Fortran, you need to do it slightly
+differently there.)
 
 
 Fortran
@@ -566,15 +571,18 @@ command:
 If you have just installed MUSCLE3, then the above bits are currently on your
 screen, so you can just copy-paste them from there.
 
-Intel Fortran compiler
-``````````````````````
+Intel® Fortran compiler
+```````````````````````
 
-MUSCLE3 cannot currently be compiled with the Intel toolchain (see above under
-C++ for details). You can however compile your submodel with the Intel compiler
-and link it to the GNU-compiled MUSCLE3 library, because the compilers are
-almost compatible. The one issue is that the ``.mod`` files created by the GNU
-compiler (and installed when you install ``libmuscle``) cannot be read by the
-Intel compiler.
+MUSCLE3 can be compiled with the Intel® compilers as described above, after
+which you can compile your model with the Intel® Fortran compiler as well and
+link with libmuscle.
+
+Alternatively, you can compile MUSCLE3 with the GNU compilers, and then compile
+your model with the Intel® compilers and link it to the GNU-compiled MUSCLE3
+library, because the compilers are almost compatible. The one issue is that the
+``.mod`` files created by the GNU compiler (and installed when you install
+``libmuscle``) cannot be read by the Intel® compiler.
 
 To solve this, you need to use the corresponding ``.f90`` files instead. These
 are installed by in ``<PREFIX>/include``, and are called ``libmuscle.f90``,
@@ -582,11 +590,9 @@ are installed by in ``<PREFIX>/include``, and are called ``libmuscle.f90``,
 other source file in your submodel, and then link them with rest of the submodel
 and the shared library as described above.
 
-Intel has recently made it a bit easier to install their proprietary compilers,
-which should allow us to integrated it into the Continuous Integration system.
-That should enable support for it in MUSCLE3, and since many people want to use
-the Intel compilers (especially for compiling Fortran codes) we plan to do so in
-a future release. If this interests you, please add a comment to the `issue on
-Intel compiler support <https://github.com/multiscale/muscle3/issues/25>`_ so we
-know.
+
+.. rubric:: Footnotes
+
+.. [#f1] Intel and the Intel logo are trademarks of Intel Corporation or its
+         subsidiaries.
 

From 6ab5e855e93d2508e56006e2169326fea0d3ad4c Mon Sep 17 00:00:00 2001
From: Lourens Veen <l.veen@esciencecenter.nl>
Date: Tue, 17 Jan 2023 21:09:57 +0100
Subject: [PATCH 177/183] Fix PostOffice virtual destructor warning

---
 libmuscle/cpp/src/libmuscle/mcp/transport_server.cpp | 3 +++
 libmuscle/cpp/src/libmuscle/mcp/transport_server.hpp | 3 +++
 libmuscle/cpp/src/libmuscle/post_office.hpp          | 2 +-
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/libmuscle/cpp/src/libmuscle/mcp/transport_server.cpp b/libmuscle/cpp/src/libmuscle/mcp/transport_server.cpp
index eecfa87c..56dfcf56 100644
--- a/libmuscle/cpp/src/libmuscle/mcp/transport_server.cpp
+++ b/libmuscle/cpp/src/libmuscle/mcp/transport_server.cpp
@@ -3,6 +3,9 @@
 
 namespace libmuscle { namespace impl {namespace mcp {
 
+
+RequestHandler::~RequestHandler() {}
+
 TransportServer::TransportServer(RequestHandler & handler)
     : handler_(handler)
 {}
diff --git a/libmuscle/cpp/src/libmuscle/mcp/transport_server.hpp b/libmuscle/cpp/src/libmuscle/mcp/transport_server.hpp
index 32ada05d..38b612c0 100644
--- a/libmuscle/cpp/src/libmuscle/mcp/transport_server.hpp
+++ b/libmuscle/cpp/src/libmuscle/mcp/transport_server.hpp
@@ -18,6 +18,9 @@ namespace libmuscle { namespace impl { namespace mcp {
  */
 class RequestHandler {
     public:
+        /** Destruct the object */
+        virtual ~RequestHandler();
+
         /** Handle a request
          *
          * Requests may be handled immediately, or they may be deferred if a
diff --git a/libmuscle/cpp/src/libmuscle/post_office.hpp b/libmuscle/cpp/src/libmuscle/post_office.hpp
index 5f8cc7ab..5ce5c9ad 100644
--- a/libmuscle/cpp/src/libmuscle/post_office.hpp
+++ b/libmuscle/cpp/src/libmuscle/post_office.hpp
@@ -51,7 +51,7 @@ class PostOffice : public mcp::RequestHandler {
 
         /** Destruct a PostOffice.
          */
-        ~PostOffice();
+        virtual ~PostOffice();
 
         /** Handle a request
          *

From 6ae9565d3739bf64a6e0041e789b1f9ed93310e8 Mon Sep 17 00:00:00 2001
From: Lourens Veen <l.veen@esciencecenter.nl>
Date: Tue, 17 Jan 2023 21:25:11 +0100
Subject: [PATCH 178/183] Update release docs to reflect new doc dependency
 system

---
 docs/source/releasing.rst | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/docs/source/releasing.rst b/docs/source/releasing.rst
index e93feb0f..20ef617f 100644
--- a/docs/source/releasing.rst
+++ b/docs/source/releasing.rst
@@ -20,10 +20,9 @@ ensures we get working online documentation. So this needs to be checked:
 - Check the other languages too
 
 If the Python API docs are missing, then it's likely to be a dependency problem.
-Sphinx needs dependencies installed, and that's done differently by tox (which
-uses `setup.py`) and RTD (which uses `docs/requirements.txt`). If the latter is
-outdated, the Python API docs don't render because Sphinx fails to import the
-packages.
+Sphinx needs dependencies installed, and that's now done via tox so that the
+dependencies come from ``setup.py`` and local builds should match what RTD does,
+but it's still not the same process, so it's best to check.
 
 Check metadata
 --------------

From 55e46f9c2ae21fb845515bb25eea6ea9ea224bd4 Mon Sep 17 00:00:00 2001
From: Lourens Veen <l.veen@esciencecenter.nl>
Date: Tue, 17 Jan 2023 22:30:08 +0100
Subject: [PATCH 179/183] Update to ymmsl-python 0.13

---
 .github/workflows/ci_ubuntu20.04.yaml       | 2 +-
 .github/workflows/ci_ubuntu20.04_clang.yaml | 2 +-
 .github/workflows/ci_ubuntu20.04_intel.yaml | 2 +-
 .github/workflows/ci_ubuntu22.04.yaml       | 2 +-
 .github/workflows/ci_ubuntu22.04_clang.yaml | 2 +-
 .github/workflows/ci_ubuntu22.04_intel.yaml | 2 +-
 setup.py                                    | 2 +-
 tox.ini                                     | 4 ++--
 8 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/ci_ubuntu20.04.yaml b/.github/workflows/ci_ubuntu20.04.yaml
index f3c51d10..3c86bc54 100644
--- a/.github/workflows/ci_ubuntu20.04.yaml
+++ b/.github/workflows/ci_ubuntu20.04.yaml
@@ -16,4 +16,4 @@ jobs:
     - uses: actions/checkout@v2
 
     - name: Run tests on Ubuntu 20.04
-      run: docker run -v "${GITHUB_WORKSPACE}:/workspace" --env LC_ALL=C.UTF-8 --env LANG=C.UTF-8 --env DEBIAN_FRONTEND=noninteractive ubuntu:20.04 /bin/bash -c 'apt-get update && apt-get -y dist-upgrade && apt-get -y install build-essential cmake gfortran git valgrind libopenmpi-dev pkg-config python3 python3-pip python3-venv curl && apt-get -y remove libssl-dev zlib1g-dev && useradd -m -d /home/muscle3 muscle3 && su muscle3 -c -- "cp -r --preserve=mode /workspace /home/muscle3/muscle3" && su muscle3 -c -- "pip3 install --user -U \"pip<22\" setuptools wheel" && su muscle3 -c -- "pip3 install --user \"ymmsl>=0.12.0,<0.13\" qcg-pilotjob==0.13.1" && su muscle3 -c -- "cd /home/muscle3/muscle3 && make test_examples"'
+      run: docker run -v "${GITHUB_WORKSPACE}:/workspace" --env LC_ALL=C.UTF-8 --env LANG=C.UTF-8 --env DEBIAN_FRONTEND=noninteractive ubuntu:20.04 /bin/bash -c 'apt-get update && apt-get -y dist-upgrade && apt-get -y install build-essential cmake gfortran git valgrind libopenmpi-dev pkg-config python3 python3-pip python3-venv curl && apt-get -y remove libssl-dev zlib1g-dev && useradd -m -d /home/muscle3 muscle3 && su muscle3 -c -- "cp -r --preserve=mode /workspace /home/muscle3/muscle3" && su muscle3 -c -- "pip3 install --user -U \"pip<22\" setuptools wheel" && su muscle3 -c -- "pip3 install --user \"ymmsl>=0.13.0,<0.14\" qcg-pilotjob==0.13.1" && su muscle3 -c -- "cd /home/muscle3/muscle3 && make test_examples"'
diff --git a/.github/workflows/ci_ubuntu20.04_clang.yaml b/.github/workflows/ci_ubuntu20.04_clang.yaml
index 749aca38..32b021a6 100644
--- a/.github/workflows/ci_ubuntu20.04_clang.yaml
+++ b/.github/workflows/ci_ubuntu20.04_clang.yaml
@@ -16,4 +16,4 @@ jobs:
     - uses: actions/checkout@v2
 
     - name: Run tests on Ubuntu 20.04 with Clang
-      run: docker run -v "${GITHUB_WORKSPACE}:/workspace" --env LC_ALL=C.UTF-8 --env LANG=C.UTF-8 --env DEBIAN_FRONTEND=noninteractive ubuntu:20.04 /bin/bash -c 'apt-get update && apt-get -y dist-upgrade && apt-get -y install build-essential clang cmake gfortran git valgrind libopenmpi-dev pkg-config python3 python3-pip python3-venv curl && apt-get -y remove libssl-dev zlib1g-dev && useradd -m -d /home/muscle3 muscle3 && su muscle3 -c -- "cp -r --preserve=mode /workspace /home/muscle3/muscle3" && su muscle3 -c -- "pip3 install --user -U \"pip<22\" setuptools wheel" && su muscle3 -c -- "pip3 install --user \"ymmsl>=0.12.0,<0.13\" qcg-pilotjob==0.13.1" && su muscle3 -c -- "cd /home/muscle3/muscle3 && CXXFLAGS=-fPIE OMPI_CXX=clang++ CXX=clang++ make test_examples"'
+      run: docker run -v "${GITHUB_WORKSPACE}:/workspace" --env LC_ALL=C.UTF-8 --env LANG=C.UTF-8 --env DEBIAN_FRONTEND=noninteractive ubuntu:20.04 /bin/bash -c 'apt-get update && apt-get -y dist-upgrade && apt-get -y install build-essential clang cmake gfortran git valgrind libopenmpi-dev pkg-config python3 python3-pip python3-venv curl && apt-get -y remove libssl-dev zlib1g-dev && useradd -m -d /home/muscle3 muscle3 && su muscle3 -c -- "cp -r --preserve=mode /workspace /home/muscle3/muscle3" && su muscle3 -c -- "pip3 install --user -U \"pip<22\" setuptools wheel" && su muscle3 -c -- "pip3 install --user \"ymmsl>=0.13.0,<0.14\" qcg-pilotjob==0.13.1" && su muscle3 -c -- "cd /home/muscle3/muscle3 && CXXFLAGS=-fPIE OMPI_CXX=clang++ CXX=clang++ make test_examples"'
diff --git a/.github/workflows/ci_ubuntu20.04_intel.yaml b/.github/workflows/ci_ubuntu20.04_intel.yaml
index c39874c8..c21b6cf3 100644
--- a/.github/workflows/ci_ubuntu20.04_intel.yaml
+++ b/.github/workflows/ci_ubuntu20.04_intel.yaml
@@ -19,4 +19,4 @@ jobs:
     - uses: actions/checkout@v2
 
     - name: Run tests on Ubuntu 20.04 with Clang
-      run: docker run -v "${GITHUB_WORKSPACE}:/workspace" --env LC_ALL=C.UTF-8 --env LANG=C.UTF-8 --env DEBIAN_FRONTEND=noninteractive ubuntu:20.04 /bin/bash -c 'apt-get update && apt-get -y install wget && wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB && mv GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB /etc/apt/trusted.gpg.d/intel-sw-products.asc && echo "deb https://apt.repos.intel.com/oneapi all main" >/etc/apt/sources.list.d/oneAPI.list && apt-get update && apt-get -y dist-upgrade && apt-get -y install build-essential cmake git valgrind pkg-config python3 python3-pip python3-venv curl intel-oneapi-compiler-dpcpp-cpp-2021.1.1 intel-oneapi-compiler-fortran-2021.1.1 intel-oneapi-mpi-devel-2021.1.1 && apt-get -y remove libssl-dev zlib1g-dev && useradd -m -d /home/muscle3 muscle3 && su muscle3 -c -- "cp -r --preserve=mode /workspace /home/muscle3/muscle3" && su muscle3 -c -- "pip3 install --user -U \"pip<22\" setuptools wheel" && su muscle3 -c -- "pip3 install --user \"ymmsl>=0.12.0,<0.13\" qcg-pilotjob==0.13.1" && su muscle3 -c -s /bin/bash -- "cd /home/muscle3/muscle3 && . /opt/intel/oneapi/setvars.sh && CXX=icpx MPICXX=\"mpiicpc -cxx=icpx\" FC=ifx MPIFC=\"mpiifort -fc=ifx\" make test_examples"'
+      run: docker run -v "${GITHUB_WORKSPACE}:/workspace" --env LC_ALL=C.UTF-8 --env LANG=C.UTF-8 --env DEBIAN_FRONTEND=noninteractive ubuntu:20.04 /bin/bash -c 'apt-get update && apt-get -y install wget && wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB && mv GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB /etc/apt/trusted.gpg.d/intel-sw-products.asc && echo "deb https://apt.repos.intel.com/oneapi all main" >/etc/apt/sources.list.d/oneAPI.list && apt-get update && apt-get -y dist-upgrade && apt-get -y install build-essential cmake git valgrind pkg-config python3 python3-pip python3-venv curl intel-oneapi-compiler-dpcpp-cpp-2021.1.1 intel-oneapi-compiler-fortran-2021.1.1 intel-oneapi-mpi-devel-2021.1.1 && apt-get -y remove libssl-dev zlib1g-dev && useradd -m -d /home/muscle3 muscle3 && su muscle3 -c -- "cp -r --preserve=mode /workspace /home/muscle3/muscle3" && su muscle3 -c -- "pip3 install --user -U \"pip<22\" setuptools wheel" && su muscle3 -c -- "pip3 install --user \"ymmsl>=0.13.0,<0.14\" qcg-pilotjob==0.13.1" && su muscle3 -c -s /bin/bash -- "cd /home/muscle3/muscle3 && . /opt/intel/oneapi/setvars.sh && CXX=icpx MPICXX=\"mpiicpc -cxx=icpx\" FC=ifx MPIFC=\"mpiifort -fc=ifx\" make test_examples"'
diff --git a/.github/workflows/ci_ubuntu22.04.yaml b/.github/workflows/ci_ubuntu22.04.yaml
index 37debedc..2a188074 100644
--- a/.github/workflows/ci_ubuntu22.04.yaml
+++ b/.github/workflows/ci_ubuntu22.04.yaml
@@ -16,4 +16,4 @@ jobs:
     - uses: actions/checkout@v2
 
     - name: Run tests on Ubuntu 22.04
-      run: docker run -v "${GITHUB_WORKSPACE}:/workspace" --env LC_ALL=C.UTF-8 --env LANG=C.UTF-8 --env DEBIAN_FRONTEND=noninteractive ubuntu:22.04 /bin/bash -c 'apt-get update && apt-get -y dist-upgrade && apt-get -y install build-essential cmake gfortran git valgrind libopenmpi-dev pkg-config python3 python3-pip python3-venv curl && apt-get -y remove libssl-dev && useradd -m -d /home/muscle3 muscle3 && su muscle3 -c -- "cp -r --preserve=mode /workspace /home/muscle3/muscle3" && su muscle3 -c -- "pip3 install -U pip setuptools wheel" && su muscle3 -c -- "pip3 install \"ymmsl>=0.12.0,<0.13\" qcg-pilotjob==0.13.1" && su muscle3 -c -- "cd /home/muscle3/muscle3 && make test_examples"'
+      run: docker run -v "${GITHUB_WORKSPACE}:/workspace" --env LC_ALL=C.UTF-8 --env LANG=C.UTF-8 --env DEBIAN_FRONTEND=noninteractive ubuntu:22.04 /bin/bash -c 'apt-get update && apt-get -y dist-upgrade && apt-get -y install build-essential cmake gfortran git valgrind libopenmpi-dev pkg-config python3 python3-pip python3-venv curl && apt-get -y remove libssl-dev && useradd -m -d /home/muscle3 muscle3 && su muscle3 -c -- "cp -r --preserve=mode /workspace /home/muscle3/muscle3" && su muscle3 -c -- "pip3 install -U pip setuptools wheel" && su muscle3 -c -- "pip3 install \"ymmsl>=0.13.0,<0.14\" qcg-pilotjob==0.13.1" && su muscle3 -c -- "cd /home/muscle3/muscle3 && make test_examples"'
diff --git a/.github/workflows/ci_ubuntu22.04_clang.yaml b/.github/workflows/ci_ubuntu22.04_clang.yaml
index 3c20e9e8..5c3431d5 100644
--- a/.github/workflows/ci_ubuntu22.04_clang.yaml
+++ b/.github/workflows/ci_ubuntu22.04_clang.yaml
@@ -16,4 +16,4 @@ jobs:
     - uses: actions/checkout@v2
 
     - name: Run tests on Ubuntu 22.04 with Clang
-      run: docker run -v "${GITHUB_WORKSPACE}:/workspace" --env LC_ALL=C.UTF-8 --env LANG=C.UTF-8 --env DEBIAN_FRONTEND=noninteractive ubuntu:22.04 /bin/bash -c 'apt-get update && apt-get -y dist-upgrade && apt-get -y install build-essential clang cmake gfortran git valgrind libopenmpi-dev pkg-config python3 python3-pip python3-venv curl && apt-get -y remove libssl-dev && useradd -m -d /home/muscle3 muscle3 && su muscle3 -c -- "cp -r --preserve=mode /workspace /home/muscle3/muscle3" && su muscle3 -c -- "pip3 install -U pip setuptools wheel" && su muscle3 -c -- "pip3 install \"ymmsl>=0.12.0,<0.13\" qcg-pilotjob==0.13.1" && su muscle3 -c -- "cd /home/muscle3/muscle3 && CXXFLAGS=-fPIE OMPI_CXX=clang++ CXX=clang++ make test_examples"'
+      run: docker run -v "${GITHUB_WORKSPACE}:/workspace" --env LC_ALL=C.UTF-8 --env LANG=C.UTF-8 --env DEBIAN_FRONTEND=noninteractive ubuntu:22.04 /bin/bash -c 'apt-get update && apt-get -y dist-upgrade && apt-get -y install build-essential clang cmake gfortran git valgrind libopenmpi-dev pkg-config python3 python3-pip python3-venv curl && apt-get -y remove libssl-dev && useradd -m -d /home/muscle3 muscle3 && su muscle3 -c -- "cp -r --preserve=mode /workspace /home/muscle3/muscle3" && su muscle3 -c -- "pip3 install -U pip setuptools wheel" && su muscle3 -c -- "pip3 install \"ymmsl>=0.13.0,<0.14\" qcg-pilotjob==0.13.1" && su muscle3 -c -- "cd /home/muscle3/muscle3 && CXXFLAGS=-fPIE OMPI_CXX=clang++ CXX=clang++ make test_examples"'
diff --git a/.github/workflows/ci_ubuntu22.04_intel.yaml b/.github/workflows/ci_ubuntu22.04_intel.yaml
index b1c11cc5..fb34e175 100644
--- a/.github/workflows/ci_ubuntu22.04_intel.yaml
+++ b/.github/workflows/ci_ubuntu22.04_intel.yaml
@@ -17,4 +17,4 @@ jobs:
     - uses: actions/checkout@v2
 
     - name: Run tests on Ubuntu 22.04 with the Intel compiler
-      run: docker run -v "${GITHUB_WORKSPACE}:/workspace" --env LC_ALL=C.UTF-8 --env LANG=C.UTF-8 --env DEBIAN_FRONTEND=noninteractive ubuntu:22.04 /bin/bash -c 'apt-get update && apt-get -y install wget && wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB && mv GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB /etc/apt/trusted.gpg.d/intel-sw-products.asc && echo "deb https://apt.repos.intel.com/oneapi all main" >/etc/apt/sources.list.d/oneAPI.list && apt-get update && apt-get -y dist-upgrade && apt-get -y install build-essential cmake git valgrind pkg-config python3 python3-pip python3-venv curl intel-oneapi-compiler-dpcpp-cpp intel-oneapi-compiler-fortran intel-oneapi-mpi-devel && apt-get -y remove libssl-dev && useradd -m -d /home/muscle3 muscle3 && su muscle3 -c -- "cp -r --preserve=mode /workspace /home/muscle3/muscle3" && su muscle3 -c -- "pip3 install -U pip setuptools wheel" && su muscle3 -c -- "pip3 install \"ymmsl>=0.12.0,<0.13\" qcg-pilotjob==0.13.1" && su muscle3 -c -- "cd /home/muscle3/muscle3 && . /opt/intel/oneapi/setvars.sh && MPICXX=\"mpiicpc -cxx=icpx\" CXX=icpx MPIFC=\"mpiifort -fc=ifx\" FC=ifx make test_examples"'
+      run: docker run -v "${GITHUB_WORKSPACE}:/workspace" --env LC_ALL=C.UTF-8 --env LANG=C.UTF-8 --env DEBIAN_FRONTEND=noninteractive ubuntu:22.04 /bin/bash -c 'apt-get update && apt-get -y install wget && wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB && mv GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB /etc/apt/trusted.gpg.d/intel-sw-products.asc && echo "deb https://apt.repos.intel.com/oneapi all main" >/etc/apt/sources.list.d/oneAPI.list && apt-get update && apt-get -y dist-upgrade && apt-get -y install build-essential cmake git valgrind pkg-config python3 python3-pip python3-venv curl intel-oneapi-compiler-dpcpp-cpp intel-oneapi-compiler-fortran intel-oneapi-mpi-devel && apt-get -y remove libssl-dev && useradd -m -d /home/muscle3 muscle3 && su muscle3 -c -- "cp -r --preserve=mode /workspace /home/muscle3/muscle3" && su muscle3 -c -- "pip3 install -U pip setuptools wheel" && su muscle3 -c -- "pip3 install \"ymmsl>=0.13.0,<0.14\" qcg-pilotjob==0.13.1" && su muscle3 -c -- "cd /home/muscle3/muscle3 && . /opt/intel/oneapi/setvars.sh && MPICXX=\"mpiicpc -cxx=icpx\" CXX=icpx MPIFC=\"mpiifort -fc=ifx\" FC=ifx make test_examples"'
diff --git a/setup.py b/setup.py
index 40470f97..50cd2ab0 100644
--- a/setup.py
+++ b/setup.py
@@ -57,7 +57,7 @@
         "numpy>=1.22,<=1.25; python_version>='3.8'",
         'qcg-pilotjob==0.13.1',
         'typing_extensions<4',
-        'ymmsl>=0.12.0,<0.13'          # Also in CI, update there as well
+        'ymmsl>=0.13.0,<0.14'          # Also in CI, update there as well
     ],
     extras_require={
         'dev': [
diff --git a/tox.ini b/tox.ini
index 56038721..3ef4eb68 100644
--- a/tox.ini
+++ b/tox.ini
@@ -8,7 +8,7 @@ deps =
     flake8<6
     pytest
     pytest-cov
-    git+https://github.com/multiscale/ymmsl-python.git@develop#egg=ymmsl
+    ymmsl
 
 passenv =
     MUSCLE_TEST_PYTHON_ONLY
@@ -42,6 +42,6 @@ deps =
     sphinx_rtd_theme
     sphinx-click
     sphinx-tabs
-    git+https://github.com/multiscale/ymmsl-python.git@develop#egg=ymmsl
+    ymmsl
 commands = sphinx-build docs/source docs/build -bhtml
 

From 3c6de46d60eda5c233d51e6096866d94e68a4ccf Mon Sep 17 00:00:00 2001
From: Lourens Veen <l.veen@esciencecenter.nl>
Date: Tue, 17 Jan 2023 22:36:12 +0100
Subject: [PATCH 180/183] Add version 0.6.0 to the change log

---
 CHANGELOG.rst | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 0f09b2f6..b7266209 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -5,6 +5,37 @@ Change Log
 All notable changes to this project will be documented in this file.
 This project adheres to `Semantic Versioning <http://semver.org/>`_.
 
+0.6.0
+*****
+
+Added
+-----
+
+* Connecting multiple conduits to outgoing ports
+* Checkpointing (preview, not fully reliable and open to change)
+* Clang support
+* Intel® compiler support
+* Error in case different versions of MUSCLE3 are used
+
+Improved
+--------
+
+* TCP latency (performance)
+* More helpful messages for configuration errors
+* Small documentation improvements
+
+Removed
+-------
+
+* Python 3.6 support
+
+Thanks
+------
+
+* Maarten at Ignition Computing for implementing much of the above
+* The ITER Organisation for funding much of this work
+
+
 0.5.0
 *****
 

From 8080efbfee8b82e7da92dbfa5a6ab787f32106f9 Mon Sep 17 00:00:00 2001
From: Lourens Veen <l.veen@esciencecenter.nl>
Date: Tue, 17 Jan 2023 22:42:20 +0100
Subject: [PATCH 181/183] Set release version to 0.6.0

---
 VERSION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/VERSION b/VERSION
index a5156d36..a918a2aa 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-0.5.1.dev
+0.6.0

From 45fda4ac3919a0055a5a3829b29ae1b1c4198ae6 Mon Sep 17 00:00:00 2001
From: Lourens Veen <l.veen@esciencecenter.nl>
Date: Tue, 17 Jan 2023 22:51:52 +0100
Subject: [PATCH 182/183] Fix examples dependency versions

---
 docs/source/examples/python/requirements.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/examples/python/requirements.txt b/docs/source/examples/python/requirements.txt
index cabe0c71..f160cac5 100644
--- a/docs/source/examples/python/requirements.txt
+++ b/docs/source/examples/python/requirements.txt
@@ -2,7 +2,7 @@ matplotlib>=3,<4
 numpy<1.22; python_version=='3.7'
 numpy>=1.22,<=1.25; python_version>='3.8'
 sobol_seq==0.2.0
-yatiml==0.9.0
-git+https://github.com/multiscale/ymmsl-python.git@develop#egg=ymmsl
+yatiml>=0.10.0,<0.11
+ymmsl>=0.13,<0.14
 qcg-pilotjob==0.13.1
 

From acf95d1242fd06cd4c573ba048d4834daefa31e2 Mon Sep 17 00:00:00 2001
From: Lourens Veen <l.veen@esciencecenter.nl>
Date: Tue, 17 Jan 2023 23:00:44 +0100
Subject: [PATCH 183/183] Update badges to point to master

---
 README.rst | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.rst b/README.rst
index 504fd99f..432a7bde 100644
--- a/README.rst
+++ b/README.rst
@@ -1,11 +1,11 @@
 .. image:: https://github.com/multiscale/muscle3/raw/develop/docs/source/muscle3_logo_readme.png
     :alt: MUSCLE3
 
-.. image:: https://readthedocs.org/projects/muscle3/badge/?version=develop
-    :target: https://muscle3.readthedocs.io/en/develop/?badge=develop
+.. image:: https://readthedocs.org/projects/muscle3/badge/?version=master
+    :target: https://muscle3.readthedocs.io/en/develop/?badge=master
     :alt: Documentation Build Status
 
-.. image:: https://github.com/multiscale/muscle3/workflows/continuous_integration/badge.svg?branch=develop
+.. image:: https://github.com/multiscale/muscle3/workflows/continuous_integration/badge.svg?branch=master
     :target: https://github.com/multiscale/muscle3/actions
     :alt: Build Status