From fa0aa3a2bb185893de61736cb4c7796307c5b7f6 Mon Sep 17 00:00:00 2001 From: Abel Soares Siqueira Date: Thu, 3 Feb 2022 14:03:34 +0100 Subject: [PATCH 001/183] Add cffconvert.yml to validate CITATION.cff --- .github/workflows/cffconvert.yml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 .github/workflows/cffconvert.yml diff --git a/.github/workflows/cffconvert.yml b/.github/workflows/cffconvert.yml new file mode 100644 index 00000000..707a71c4 --- /dev/null +++ b/.github/workflows/cffconvert.yml @@ -0,0 +1,19 @@ +name: cffconvert + +on: + push: + paths: + - CITATION.cff + +jobs: + validate: + name: "validate" + runs-on: ubuntu-latest + steps: + - name: Check out a copy of the repository + uses: actions/checkout@v2 + + - name: Check whether the citation metadata from CITATION.cff is valid + uses: citation-file-format/cffconvert-github-action@2.0.0 + with: + args: "--validate" From a327e141b8e035599ed23dc25d9d8d52232fbb54 Mon Sep 17 00:00:00 2001 From: Abel Soares Siqueira Date: Thu, 3 Feb 2022 14:03:34 +0100 Subject: [PATCH 002/183] Update CITATION.cff cffversion to 1.2.0 --- CITATION.cff | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CITATION.cff b/CITATION.cff index 9b2c5304..71dbcb44 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -1,6 +1,6 @@ # YAML 1.2 --- -cff-version: "1.1.0" +cff-version: 1.2.0 title: "MUSCLE 3: The Multiscale Coupling Library and Environment" doi: "10.5281/zenodo.3258864" From 38761c9676ed14c038072a33079c0c6cdfc0e09b Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Tue, 9 Aug 2022 13:34:14 +0200 Subject: [PATCH 003/183] Set --muscle_manager in test_mpi_macro_micro Fixes the test case integration_test/test_mpi_macro_micro.py from failing when the default port (9000) is occupied by another process when running `make test`. --- integration_test/test_mpi_macro_micro.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/integration_test/test_mpi_macro_micro.py b/integration_test/test_mpi_macro_micro.py index af0c4890..2b02fa76 100644 --- a/integration_test/test_mpi_macro_micro.py +++ b/integration_test/test_mpi_macro_micro.py @@ -11,8 +11,9 @@ from .conftest import skip_if_python_only -def run_macro(instance_id: str): +def run_macro(instance_id: str, muscle_manager: str): sys.argv.append('--muscle-instance={}'.format(instance_id)) + sys.argv.append('--muscle-manager={}'.format(muscle_manager)) macro() @@ -61,7 +62,8 @@ def test_mpi_macro_micro(tmpdir, mmp_server_process_simple): str(mpi_test_micro), '--muscle-instance=micro'], env=env) # run macro model - macro_process = mp.Process(target=run_macro, args=('macro',)) + macro_process = mp.Process(target=run_macro, + args=('macro', mmp_server_process_simple)) macro_process.start() # check results From 6eba65c0655fb7dea1b52efddf2da6d6b167888e Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Tue, 9 Aug 2022 13:38:05 +0200 Subject: [PATCH 004/183] Fix flake8 error Fixes the following flake8 v5.0.4 error: - libmuscle/python/libmuscle/instance.py:443:20: E275 missing whitespace after keyword --- libmuscle/python/libmuscle/instance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py index e0e6d542..24c10d5a 100644 --- a/libmuscle/python/libmuscle/instance.py +++ b/libmuscle/python/libmuscle/instance.py @@ -440,7 +440,7 @@ def __receive_message( if port.operator == Operator.F_INIT: if (port_name, slot) in self._f_init_cache: msg = self._f_init_cache[(port_name, slot)] - del(self._f_init_cache[(port_name, slot)]) + del self._f_init_cache[(port_name, slot)] if with_settings and msg.settings is None: err_msg = ('If you use receive_with_settings()' ' on an F_INIT port, then you have to' From 36d37e50f54c26b5b0c9b3cafada23e20ad72616 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Thu, 15 Sep 2022 15:57:55 +0200 Subject: [PATCH 005/183] Add ITER Organization as copyright holder. Welcome and thank you! --- NOTICE | 1 + README.rst | 3 ++- docs/source/conf.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/NOTICE b/NOTICE index 9110265d..2a353538 100644 --- a/NOTICE +++ b/NOTICE @@ -1,2 +1,3 @@ MUSCLE3 Copyright 2018-2022, Netherlands eScience Center and University of Amsterdam +Copyright 2022, The ITER Organization diff --git a/README.rst b/README.rst index ec9a7352..27405971 100644 --- a/README.rst +++ b/README.rst @@ -40,7 +40,8 @@ Legal ===== MUSCLE3 is Copyright 2018-2022 University of Amsterdam and Netherlands eScience -Center. It is licensed under the Apache License 2.0. +Center, and Copyright 2022 ITER Organisation. It is licensed under the Apache +License 2.0. Contributing diff --git a/docs/source/conf.py b/docs/source/conf.py index c9a0c10c..7464d1c0 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -67,7 +67,7 @@ # General information about the project. project = 'muscle3' -copyright = '2018-2022 University of Amsterdam and Netherlands eScience Center' +copyright = '2018-2022 University of Amsterdam and Netherlands eScience Center, 2022 The ITER Organization' author = 'Lourens Veen' # The version info for the project you're documenting, acts as replacement for From 50241cd24e715b83c73027d7c80e8ab53ebe4ee5 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Thu, 15 Sep 2022 12:55:27 +0200 Subject: [PATCH 006/183] Reduce delays - instance_manager.py: - Increase polling frequency of LogHandlingThread. Reduces the shutdown delay (in InstanceManager.shutdown), saves 0-900 ms. - qcgpj_instantiator.py - Add a 10ms delay at the start of QCGPJInstantiator._main to allow the main process some time for submitting InstantiationRequests. Saves 90ms in startup duration. - Do not sleep in QCGPJInstantiator._main when a shutdown request is received. If all instances exited successfully we can immediately be done and save a 100ms wait. - tcp_transport_server.py - Set poll_interval of SocketServer.serve_forever to 100ms. Saves 0-400 ms in shutdown duration (in instances and muscle_manager). Combined, these changes save 190ms to ~2 seconds for a run started with `muscle_manager --start-all`. This is most notable for short runs, like the ones in the unit tests. `make test` duration, averaged over 5 runs (no compilation): - Before this commit: 50.51 seconds - After this commit: 29.51 seconds --- libmuscle/python/libmuscle/manager/instance_manager.py | 2 +- libmuscle/python/libmuscle/manager/qcgpj_instantiator.py | 7 ++++++- libmuscle/python/libmuscle/mcp/tcp_transport_server.py | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/libmuscle/python/libmuscle/manager/instance_manager.py b/libmuscle/python/libmuscle/manager/instance_manager.py index 139c6beb..9d2c8b30 100644 --- a/libmuscle/python/libmuscle/manager/instance_manager.py +++ b/libmuscle/python/libmuscle/manager/instance_manager.py @@ -41,7 +41,7 @@ def run(self) -> None: """The thread's entry point.""" while True: try: - record = self._queue.get(True, 1.0) + record = self._queue.get(True, 0.1) logger = logging.getLogger(record.name) logger.handle(record) except queue.Empty: diff --git a/libmuscle/python/libmuscle/manager/qcgpj_instantiator.py b/libmuscle/python/libmuscle/manager/qcgpj_instantiator.py index e3575b11..89f8bc3e 100644 --- a/libmuscle/python/libmuscle/manager/qcgpj_instantiator.py +++ b/libmuscle/python/libmuscle/manager/qcgpj_instantiator.py @@ -148,9 +148,12 @@ async def _main(self) -> None: """ qcg_iters = dict() # type: Dict[Reference, qcg_SchedulingIteration] + await asyncio.sleep(0.01) # allow requests_in queue to be populated + shutting_down = False done = False while not done: + do_sleep = True while not shutting_down: try: request = self._requests_in.get_nowait() @@ -158,6 +161,7 @@ async def _main(self) -> None: _logger.debug('Got ShutdownRequest') self._state_tracker.stop_processing = True shutting_down = True + do_sleep = False elif isinstance(request, CancelAllRequest): _logger.debug('Got CancelAllRequest') @@ -178,7 +182,8 @@ async def _main(self) -> None: except queue.Empty: break - await asyncio.sleep(0.1) + if do_sleep: + await asyncio.sleep(0.1) for name, process in list(self._state_tracker.processes.items()): if process.status.is_finished(): diff --git a/libmuscle/python/libmuscle/mcp/tcp_transport_server.py b/libmuscle/python/libmuscle/mcp/tcp_transport_server.py index 79513420..2219cd76 100644 --- a/libmuscle/python/libmuscle/mcp/tcp_transport_server.py +++ b/libmuscle/python/libmuscle/mcp/tcp_transport_server.py @@ -71,7 +71,7 @@ def __init__(self, handler: RequestHandler, port: int = 0) -> None: self._server = TcpTransportServerImpl(('', port), TcpHandler, self) self._server_thread = threading.Thread( - target=self._server.serve_forever, daemon=True) + target=self._server.serve_forever, args=(0.1,), daemon=True) self._server_thread.start() def get_location(self) -> str: From 900d621ab84c4e70d078cab1c6c95ac3c598755f Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Fri, 16 Sep 2022 10:20:59 +0200 Subject: [PATCH 007/183] Update qcgpj sleep --- libmuscle/python/libmuscle/manager/qcgpj_instantiator.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/libmuscle/python/libmuscle/manager/qcgpj_instantiator.py b/libmuscle/python/libmuscle/manager/qcgpj_instantiator.py index 89f8bc3e..a150904d 100644 --- a/libmuscle/python/libmuscle/manager/qcgpj_instantiator.py +++ b/libmuscle/python/libmuscle/manager/qcgpj_instantiator.py @@ -153,7 +153,6 @@ async def _main(self) -> None: shutting_down = False done = False while not done: - do_sleep = True while not shutting_down: try: request = self._requests_in.get_nowait() @@ -161,7 +160,6 @@ async def _main(self) -> None: _logger.debug('Got ShutdownRequest') self._state_tracker.stop_processing = True shutting_down = True - do_sleep = False elif isinstance(request, CancelAllRequest): _logger.debug('Got CancelAllRequest') @@ -182,9 +180,6 @@ async def _main(self) -> None: except queue.Empty: break - if do_sleep: - await asyncio.sleep(0.1) - for name, process in list(self._state_tracker.processes.items()): if process.status.is_finished(): _logger.debug(f'Reporting {name} done') @@ -195,6 +190,9 @@ async def _main(self) -> None: _logger.debug(f'Done: {self._state_tracker.processes}') done = len(self._state_tracker.processes) == 0 + if not done: + await asyncio.sleep(0.1) + _logger.debug('Stopping executor') await self._executor.stop() From a38e719fb8e2de07c18ffd3945537088df3e9cf9 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Fri, 16 Sep 2022 13:39:34 +0200 Subject: [PATCH 008/183] Create overloads for Instance.get_setting Allows typecheckers (e.g. mypy) to deduce the correct type when setting the typ argument. --- libmuscle/python/libmuscle/instance.py | 37 ++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py index 24c10d5a..af84052e 100644 --- a/libmuscle/python/libmuscle/instance.py +++ b/libmuscle/python/libmuscle/instance.py @@ -2,7 +2,7 @@ import logging import os import sys -from typing import cast, Dict, List, Optional, Tuple +from typing import Literal, cast, Dict, List, Optional, Tuple, overload from ymmsl import (Identifier, Operator, SettingValue, Port, Reference, Settings) @@ -157,6 +157,35 @@ def error_shutdown(self, message: str) -> None: """ self.__shutdown(message) + @overload + def get_setting(self, name: str, typ: Literal['str']) -> str: + ... + + @overload + def get_setting(self, name: str, typ: Literal['int']) -> int: + ... + + @overload + def get_setting(self, name: str, typ: Literal['float']) -> float: + ... + + @overload + def get_setting(self, name: str, typ: Literal['bool']) -> bool: + ... + + @overload + def get_setting(self, name: str, typ: Literal['[float]']) -> List[float]: + ... + + @overload + def get_setting( + self, name: str, typ: Literal['[[float]]']) -> List[List[float]]: + ... + + @overload + def get_setting(self, name: str, typ: None = None) -> SettingValue: + ... + def get_setting(self, name: str, typ: Optional[str] = None ) -> SettingValue: """Returns the value of a model setting. @@ -620,8 +649,7 @@ def _set_remote_log_level(self) -> None: """ try: - log_level_str = cast( - str, self.get_setting('muscle_remote_log_level', 'str')) + log_level_str = self.get_setting('muscle_remote_log_level', 'str') except KeyError: # muscle_remote_log_level not set, do nothing and keep the default return @@ -656,8 +684,7 @@ def _set_local_log_level(self) -> None: """ try: - log_level_str = cast( - str, self.get_setting('muscle_local_log_level', 'str')) + log_level_str = self.get_setting('muscle_local_log_level', 'str') log_level = LogLevel[log_level_str.upper()] if log_level is None: From 9ace491608632862fd42963582631b3022df36d2 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Fri, 16 Sep 2022 14:00:58 +0200 Subject: [PATCH 009/183] Import Literal from typing_extensions for py<3.8 --- libmuscle/python/libmuscle/instance.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py index af84052e..513018d6 100644 --- a/libmuscle/python/libmuscle/instance.py +++ b/libmuscle/python/libmuscle/instance.py @@ -2,7 +2,9 @@ import logging import os import sys -from typing import Literal, cast, Dict, List, Optional, Tuple, overload +from typing import cast, Dict, List, Optional, Tuple, overload +# TODO: import from typing module when dropping support for python 3.7 +from typing_extensions import Literal from ymmsl import (Identifier, Operator, SettingValue, Port, Reference, Settings) From c32e9fa4ebf01dfcc49d57c0c5d2e0bc1d169c0d Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Fri, 26 Aug 2022 11:23:45 +0200 Subject: [PATCH 010/183] Implement checkpoint triggers and tests --- .../python/libmuscle/snapshot_manager.py | 151 ++++++++++++++++++ .../libmuscle/test/test_snapshot_manager.py | 133 +++++++++++++++ 2 files changed, 284 insertions(+) create mode 100644 libmuscle/python/libmuscle/snapshot_manager.py create mode 100644 libmuscle/python/libmuscle/test/test_snapshot_manager.py diff --git a/libmuscle/python/libmuscle/snapshot_manager.py b/libmuscle/python/libmuscle/snapshot_manager.py new file mode 100644 index 00000000..36b742d8 --- /dev/null +++ b/libmuscle/python/libmuscle/snapshot_manager.py @@ -0,0 +1,151 @@ +import bisect +from typing import List, Optional, Union + +from ymmsl import CheckpointRange, CheckpointRules + + +class CheckpointTrigger: + """Represents a trigger for creating snapshots""" + + def next_checkpoint(self, cur_time: float) -> Optional[float]: + """Calculate the next checkpoint time + + Args: + cur_time: current time. + + Returns: + The time when a next checkpoint should be taken, or None if this + trigger has no checkpoint after cur_time. + """ + raise NotImplementedError() + + def previous_checkpoint(self, cur_time: float) -> Optional[float]: + """Calculate the previous checkpoint time + + Args: + cur_time: current time. + + Returns: + The time when a previous checkpoint should have been taken, or None + if this trigger has no checkpoint after cur_time. + """ + raise NotImplementedError() + + +class AtCheckpointTrigger(CheckpointTrigger): + """Represents a trigger based on an "at" checkpoint rule + + This triggers at the specified times. + """ + + def __init__(self, at: List[Union[float, int]]) -> None: + """Create an "at" checkpoint trigger + + Args: + at: list of checkpoint moments + """ + self._at = at + self._at.sort() # ymmsl already sorts, but just to be sure + + def next_checkpoint(self, cur_time: float) -> Optional[float]: + if cur_time >= self._at[-1]: + return None # no future checkpoint left + idx = bisect.bisect(self._at, cur_time) + return self._at[idx] + + def previous_checkpoint(self, cur_time: float) -> Optional[float]: + if cur_time < self._at[0]: + return None # no previous checkpoint + idx = bisect.bisect(self._at, cur_time) + return self._at[idx - 1] + + +class RangeCheckpointTrigger(CheckpointTrigger): + """Represents a trigger based on a "ranges" checkpoint rule + + This triggers at a range of checkpoint moments. + + Equivalent an "at" rule ``[start, start + step, start + 2*step, ...]`` for + as long as ``start + i*step <= stop``. + + Stop may be omitted, in which case the range is infinite. + + Start may be omitted, in which case the range is equivalent to an "at" rule + ``[..., -n*step, ..., -step, 0, step, 2*step, ...]`` for as long as + ``i*step <= stop``. + + Note: the "every" rule is a special case of a range with start and stop + omitted, and is handled by this class as well + """ + + def __init__(self, range: CheckpointRange) -> None: + """Create a range of checkpoints + + Args: + range: checkpoint range defining start, stop and step. + """ + self._start = range.start + self._stop = range.stop + self._step = range.step + self._last = None # type: Union[int, float, None] + if self._stop is not None: + start = 0 if self._start is None else self._start + diff = self._stop - start + self._last = start + (diff // self._step) * self._step + + def next_checkpoint(self, cur_time: float) -> Optional[float]: + if self._start is not None and cur_time < self._start: + return float(self._start) + if self._last is not None and cur_time >= self._last: + return None + start = 0 if self._start is None else self._start + diff = cur_time - start + return float(start + (diff // self._step + 1) * self._step) + + def previous_checkpoint(self, cur_time: float) -> Optional[float]: + if self._start is not None and cur_time < self._start: + return None + if self._last is not None and cur_time > self._last: + return float(self._last) + start = 0 if self._start is None else self._start + diff = cur_time - start + return float(start + (diff // self._step) * self._step) + + +class CombinedCheckpointTriggers(CheckpointTrigger): + """Checkpoint trigger based on a combination of "every", "at" and "ranges" + """ + + def __init__(self, checkpoint_rules: CheckpointRules) -> None: + """Create a new combined checkpoint trigger from the given rules + + Args: + checkpoint_rules: checkpoint rules (from ymmsl) defining "every", + "at", and/or "ranges" rules + """ + self._triggers = [] # type: List[CheckpointTrigger] + if checkpoint_rules.every is not None: + cp_range = CheckpointRange(step=checkpoint_rules.every) + self._triggers.append(RangeCheckpointTrigger(cp_range)) + if checkpoint_rules.at: + self._triggers.append(AtCheckpointTrigger(checkpoint_rules.at)) + for cp_range in checkpoint_rules.ranges: + self._triggers.append(RangeCheckpointTrigger(cp_range)) + + def next_checkpoint(self, cur_time: float) -> Optional[float]: + checkpoints = (trigger.next_checkpoint(cur_time) + for trigger in self._triggers) + # return earliest of all not-None next-checkpoints + return min((checkpoint + for checkpoint in checkpoints + if checkpoint is not None), + default=None) # return None if all triggers return None + + def previous_checkpoint(self, cur_time: float) -> Optional[float]: + checkpoints = (trigger.previous_checkpoint(cur_time) + for trigger in self._triggers) + # return latest of all not-None previous-checkpoints + return max((checkpoint + for checkpoint in checkpoints + if checkpoint is not None), + default=None) # return None if all triggers return None diff --git a/libmuscle/python/libmuscle/test/test_snapshot_manager.py b/libmuscle/python/libmuscle/test/test_snapshot_manager.py new file mode 100644 index 00000000..b557e5b8 --- /dev/null +++ b/libmuscle/python/libmuscle/test/test_snapshot_manager.py @@ -0,0 +1,133 @@ +import pytest +from ymmsl import CheckpointRange, CheckpointRules + +from libmuscle.snapshot_manager import ( + CombinedCheckpointTriggers, AtCheckpointTrigger, RangeCheckpointTrigger) + + +def test_at_checkpoint_trigger(): + trigger = AtCheckpointTrigger([1, 3, 4, 4.5, 9]) + + assert trigger.next_checkpoint(0) == 1 + assert trigger.previous_checkpoint(0) is None + + assert trigger.next_checkpoint(1) == 3 + assert trigger.previous_checkpoint(1) == 1 + + eps = 1e-16 + assert trigger.next_checkpoint(1 - eps) == 1 + assert trigger.previous_checkpoint(1 - eps) is None + + assert trigger.next_checkpoint(3.9) == 4 + assert trigger.previous_checkpoint(3.9) == 3 + + assert trigger.next_checkpoint(4.1) == 4.5 + assert trigger.previous_checkpoint(4.1) == 4 + + assert trigger.next_checkpoint(5) == 9 + assert trigger.previous_checkpoint(5) == 4.5 + + assert trigger.next_checkpoint(9) is None + assert trigger.previous_checkpoint(9) == 9 + + assert trigger.next_checkpoint(11) is None + assert trigger.previous_checkpoint(11) == 9 + + +def test_range_checkpoint_trigger(): + range = CheckpointRange(start=0, stop=20, step=1.2) + trigger = RangeCheckpointTrigger(range) + + assert trigger.next_checkpoint(-1) == 0 + assert trigger.previous_checkpoint(-1) is None + + assert trigger.next_checkpoint(0) == pytest.approx(1.2) + assert trigger.previous_checkpoint(0) == 0 + + assert trigger.next_checkpoint(8) == pytest.approx(8.4) + assert trigger.previous_checkpoint(8) == pytest.approx(7.2) + + assert trigger.next_checkpoint(18.2) == pytest.approx(19.2) + assert trigger.previous_checkpoint(18.2) == pytest.approx(18) + + assert trigger.next_checkpoint(20) is None + assert trigger.previous_checkpoint(20) == pytest.approx(19.2) + + +def test_range_checkpoint_trigger_default_stop(): + range = CheckpointRange(start=1, step=1.2) + trigger = RangeCheckpointTrigger(range) + + assert trigger.next_checkpoint(-1.) == 1 + assert trigger.previous_checkpoint(-1.) is None + + assert trigger.next_checkpoint(148148.) == pytest.approx(148148.2) + assert trigger.previous_checkpoint(148148.) == pytest.approx(148147) + + assert trigger.next_checkpoint(148148148.) == pytest.approx(148148149) + assert trigger.previous_checkpoint(148148148.) == pytest.approx(148148147.8) + + +def test_range_checkpoint_trigger_default_start(): + range = CheckpointRange(step=1.2, stop=10) + trigger = RangeCheckpointTrigger(range) + + assert trigger.next_checkpoint(10) is None + assert trigger.previous_checkpoint(10) == pytest.approx(9.6) + + assert trigger.next_checkpoint(0.0) == pytest.approx(1.2) + assert trigger.previous_checkpoint(0.0) == pytest.approx(0.0) + + assert trigger.next_checkpoint(-148148.) == pytest.approx(-148147.2) + assert trigger.previous_checkpoint(-148148.) == pytest.approx(-148148.4) + + +def test_combined_checkpoint_trigger_every_at(): + rules = CheckpointRules(every=10, at=[3, 7, 13, 17]) + trigger = CombinedCheckpointTriggers(rules) + + assert trigger.next_checkpoint(-11.) == pytest.approx(-10) + assert trigger.previous_checkpoint(-11) == pytest.approx(-20) + + assert trigger.next_checkpoint(0.) == pytest.approx(3) + assert trigger.previous_checkpoint(0.) == pytest.approx(0) + + assert trigger.next_checkpoint(8.3) == pytest.approx(10) + assert trigger.previous_checkpoint(8.3) == pytest.approx(7) + + assert trigger.next_checkpoint(14.2) == pytest.approx(17) + assert trigger.previous_checkpoint(14.2) == pytest.approx(13) + + assert trigger.next_checkpoint(25.2) == pytest.approx(30) + assert trigger.previous_checkpoint(25.2) == pytest.approx(20) + + +def test_combined_checkpoint_trigger_at_ranges(): + rules = CheckpointRules(at=[3, 7, 13, 17], ranges=[ + CheckpointRange(start=0, step=5, stop=20), + CheckpointRange(start=20, step=20, stop=100)]) + trigger = CombinedCheckpointTriggers(rules) + + assert trigger.next_checkpoint(-11.) == pytest.approx(0) + assert trigger.previous_checkpoint(-11) is None + + assert trigger.next_checkpoint(0.) == pytest.approx(3) + assert trigger.previous_checkpoint(0.) == pytest.approx(0) + + assert trigger.next_checkpoint(8.3) == pytest.approx(10) + assert trigger.previous_checkpoint(8.3) == pytest.approx(7) + + assert trigger.next_checkpoint(14.2) == pytest.approx(15) + assert trigger.previous_checkpoint(14.2) == pytest.approx(13) + + assert trigger.next_checkpoint(19.3) == pytest.approx(20) + assert trigger.previous_checkpoint(19.3) == pytest.approx(17) + + assert trigger.next_checkpoint(25.2) == pytest.approx(40) + assert trigger.previous_checkpoint(25.2) == pytest.approx(20) + + assert trigger.next_checkpoint(95.2) == pytest.approx(100) + assert trigger.previous_checkpoint(95.2) == pytest.approx(80) + + assert trigger.next_checkpoint(125.2) is None + assert trigger.previous_checkpoint(125.2) == pytest.approx(100) From 27bf3c21cbc253b72675132e8e91e4bf19510f21 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Fri, 26 Aug 2022 11:38:18 +0200 Subject: [PATCH 011/183] [tox] add ymmsl feature branch as dependency --- tox.ini | 1 + 1 file changed, 1 insertion(+) diff --git a/tox.ini b/tox.ini index 0e2a1348..23fb19f3 100644 --- a/tox.ini +++ b/tox.ini @@ -8,6 +8,7 @@ deps = flake8 pytest pytest-cov + git+https://github.com/multiscale/ymmsl-python.git@feature/checkpointing#egg=ymmsl passenv = MUSCLE_TEST_PYTHON_ONLY From a2e6a97909b7356dabc51c34fbd5155e9d0088f4 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Fri, 26 Aug 2022 16:35:05 +0200 Subject: [PATCH 012/183] Implement message counters on Port --- libmuscle/python/libmuscle/port.py | 70 +++++++++++++++++++++++++++++- 1 file changed, 68 insertions(+), 2 deletions(-) diff --git a/libmuscle/python/libmuscle/port.py b/libmuscle/python/libmuscle/port.py index beb49249..9aced2d5 100644 --- a/libmuscle/python/libmuscle/port.py +++ b/libmuscle/python/libmuscle/port.py @@ -1,8 +1,20 @@ -from typing import List, Optional +from typing import List, Optional, TypeVar + from ymmsl import Identifier, Operator import ymmsl +_T = TypeVar("_T") + + +def _extend_list_to_size(lst: List[_T], size: int, padding: _T) -> None: + """When lst is smaller than size, extend to size using padding as values + """ + num_extend = size - len(lst) + if num_extend > 0: + lst += [padding] * num_extend + + class Port(ymmsl.Port): """Represents a gateway to the outside world. @@ -10,12 +22,18 @@ class Port(ymmsl.Port): an operator, as well as a set of dimensions that determines the valid slot indices for sending or receiving on this port. + Ports keep track of the amount of messages sent or received on the port. + However, the actual incrementing and validation is done in + :class:`Communicator`. + Attributes: name (Identifier): Name of this port. operator (Operator): Operator associated with this port. """ + def __init__(self, name: str, operator: Operator, is_vector: bool, - is_connected: bool, our_ndims: int, peer_dims: List[int] + is_connected: bool, our_ndims: int, peer_dims: List[int], + num_messages: Optional[List[int]] = None ) -> None: """Create a Port. @@ -68,6 +86,13 @@ def __init__(self, name: str, operator: Operator, is_vector: bool, self._is_open = [True] self._is_resizable = is_vector and (our_ndims == len(peer_dims)) + self._num_messages = [] # type: List[int] + self._is_resuming = [] # type: List[bool] + if num_messages is not None: + self._num_messages = num_messages + self._is_resuming = [True] * len(num_messages) + _extend_list_to_size(self._num_messages, self._length or 1, 0) + _extend_list_to_size(self._is_resuming, self._length or 1, False) # Note: I'm not sure how this will develop exactly, so this class has some # accessors even if those are un-Pythonic; in the future a simple variable @@ -129,6 +154,11 @@ def set_length(self, length: int) -> None: if length != self._length: self._length = length self._is_open = [True] * self._length + # Using extend here to not discard any information about message + # numbers between resizes. Note that _num_messages and _is_resuming + # may be longer than self._length! + _extend_list_to_size(self._num_messages, self._length, 0) + _extend_list_to_size(self._is_resuming, self._length, False) def set_closed(self, slot: Optional[int] = None) -> None: """Marks this port as closed. @@ -137,3 +167,39 @@ def set_closed(self, slot: Optional[int] = None) -> None: self._is_open[slot] = False else: self._is_open = [False] + + def increment_num_messages(self, slot: Optional[int] = None) -> None: + """Increment amount of messages sent or received. + + Args: + slot: The slot that is sent/received on + """ + self._num_messages[slot or 0] += 1 + self.set_resumed(slot) + + def get_num_messages(self, slot: Optional[int] = None) -> int: + """Get the amount of messages sent or received. + + Args: + slot: The slot that is sent/received on + """ + return self._num_messages[slot or 0] + + def is_resuming(self, slot: Optional[int] = None) -> bool: + """True when this port has resumed. + + After resumption, each port/slot may discard exactly one message. + is_resuming keeps track of this state. + + Args: + slot: The slot that is sent/received on + """ + return self._is_resuming[slot or 0] + + def set_resumed(self, slot: Optional[int] = None) -> None: + """Mark that this port has resumed and may no longer discard messages. + + Args: + slot: The slot that is sent/received on + """ + self._is_resuming[slot or 0] = False From 38168a324c5a3bb1d87d9a1519701d70c134f195 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Fri, 26 Aug 2022 16:48:06 +0200 Subject: [PATCH 013/183] Add message numbers to MMPMessage --- integration_test/test_cpp_mpp_client.py | 2 +- libmuscle/python/libmuscle/communicator.py | 48 +++++++++++++------ libmuscle/python/libmuscle/mpp_message.py | 7 ++- .../libmuscle/test/test_communicator.py | 24 ++++++---- .../python/libmuscle/test/test_mpp_message.py | 10 ++-- .../python/libmuscle/test/test_outbox.py | 1 + 6 files changed, 62 insertions(+), 30 deletions(-) diff --git a/integration_test/test_cpp_mpp_client.py b/integration_test/test_cpp_mpp_client.py index a08e7bc4..7541993e 100644 --- a/integration_test/test_cpp_mpp_client.py +++ b/integration_test/test_cpp_mpp_client.py @@ -23,7 +23,7 @@ def tcp_server_process(control_pipe): message = MPPMessage( Reference('test_sender.test_port'), receiver, - 10, 1.0, 2.0, settings, data).encoded() + 10, 1.0, 2.0, settings, 0, data).encoded() def handle_request(request_bytes): request = msgpack.unpackb(request_bytes, raw=False) diff --git a/libmuscle/python/libmuscle/communicator.py b/libmuscle/python/libmuscle/communicator.py index efdcb30d..af7e14bd 100644 --- a/libmuscle/python/libmuscle/communicator.py +++ b/libmuscle/python/libmuscle/communicator.py @@ -79,6 +79,7 @@ def __init__(self, kernel: Reference, index: List[int], profiler: The profiler to use for recording sends and receives. """ + # TODO: pass a SnapshotManager and store as self._snapshot_manager self._kernel = kernel self._index = index self._declared_ports = declared_ports @@ -213,14 +214,16 @@ def send_message( snd_endpoint.port, slot_list) port_length = None - if self._ports[port_name].is_resizable(): - port_length = self._ports[port_name].get_length() + if port.is_resizable(): + port_length = port.get_length() mcp_message = MPPMessage(snd_endpoint.ref(), recv_endpoint.ref(), port_length, message.timestamp, message.next_timestamp, cast(Settings, message.settings), + port.get_num_messages(slot), message.data) + port.increment_num_messages(slot) encoded_message = mcp_message.encoded() self._post_office.deposit(recv_endpoint.ref(), encoded_message) profile_event.stop() @@ -257,12 +260,12 @@ def receive_message(self, port_name: str, slot: Optional[int] = None, connected. """ if slot is None: - _logger.debug('Waiting for message on {}'.format(port_name)) + port_and_slot = port_name slot_list = [] # type: List[int] else: - _logger.debug('Waiting for message on {}[{}]'.format( - port_name, slot)) + port_and_slot = f"{port_name}[{slot}]" slot_list = [slot] + _logger.debug('Waiting for message on {}'.format(port_and_slot)) recv_endpoint = self.__get_endpoint(port_name, slot_list) @@ -311,15 +314,26 @@ def receive_message(self, port_name: str, slot: Optional[int] = None, profile_event.port_length = port.get_length() profile_event.message_size = len(mcp_message_bytes) - if slot is None: - _logger.debug('Received message on {}'.format(port_name)) - if isinstance(mcp_message.data, ClosePort): - _logger.debug('Port {} is now closed'.format(port_name)) - else: - _logger.debug('Received message on {}[{}]'.format(port_name, slot)) - if isinstance(mcp_message.data, ClosePort): - _logger.debug('Port {}[{}] is now closed'.format( - port_name, slot)) + expected_message_number = port.get_num_messages(slot) + # TODO: handle f_init port counts for STATELESS and WEAKLY_STATEFUL + # components which didn't load a snapshot + if expected_message_number != mcp_message.message_number: + if (expected_message_number - 1 == mcp_message.message_number and + port.is_resuming(slot)): + _logger.debug(f'Discarding received message on {port_and_slot}' + ': resuming from weakly consistent snapshot') + port.set_resumed() + return self.receive_message(port_name, slot, default) + raise RuntimeError(f'Received message on {port_and_slot} with' + ' unexpected message number' + f' {mcp_message.message_number}. Was expecting' + f' {expected_message_number}. Are you resuming' + ' from an inconsistent snapshot?') + port.increment_num_messages(slot) + + _logger.debug('Received message on {}'.format(port_and_slot)) + if isinstance(mcp_message.data, ClosePort): + _logger.debug('Port {} is now closed'.format(port_and_slot)) return message @@ -380,6 +394,8 @@ def __ports_from_declared(self) -> Dict[str, Port]: ports[port_name] = Port( port_name, operator, is_vector, is_connected, len(self._index), port_peer_dims) + # TODO: retrieve num_messages[] for this port from + # self._snapshot_manager when resuming return ports def __ports_from_conduits(self, conduits: List[Conduit] @@ -411,6 +427,8 @@ def __ports_from_conduits(self, conduits: List[Conduit] ports[str(port_id)] = Port( str(port_id), operator, is_vector, is_connected, len(self._index), port_peer_dims) + # TODO: retrieve num_messages[] for this port from + # self._snapshot_manager when resuming return ports def __settings_in_port(self, conduits: List[Conduit]) -> Port: @@ -430,6 +448,8 @@ def __settings_in_port(self, conduits: List[Conduit]) -> Port: conduit.sending_component())) return Port('muscle_settings_in', Operator.F_INIT, False, False, len(self._index), []) + # TODO: retrieve num_messages[] for this port from + # self._snapshot_manager when resuming def __get_client(self, instance: Reference) -> MPPClient: """Get or create a client to connect to the given instance. diff --git a/libmuscle/python/libmuscle/mpp_message.py b/libmuscle/python/libmuscle/mpp_message.py index 15ff09f9..69ea4563 100644 --- a/libmuscle/python/libmuscle/mpp_message.py +++ b/libmuscle/python/libmuscle/mpp_message.py @@ -151,7 +151,7 @@ class MPPMessage: def __init__(self, sender: Reference, receiver: Reference, port_length: Optional[int], timestamp: float, next_timestamp: Optional[float], - settings_overlay: Settings, data: Any + settings_overlay: Settings, message_number: int, data: Any ) -> None: """Create an MPPMessage. @@ -177,6 +177,7 @@ def __init__(self, sender: Reference, receiver: Reference, self.timestamp = timestamp self.next_timestamp = next_timestamp self.settings_overlay = settings_overlay + self.message_number = message_number if isinstance(data, np.ndarray): self.data = Grid(data) else: @@ -197,11 +198,12 @@ def from_bytes(message: bytes) -> 'MPPMessage': timestamp = message_dict["timestamp"] next_timestamp = message_dict["next_timestamp"] settings_overlay = message_dict["settings_overlay"] + message_number = message_dict["message_number"] data = message_dict["data"] return MPPMessage( sender, receiver, port_length, timestamp, next_timestamp, - settings_overlay, data) + settings_overlay, message_number, data) def encoded(self) -> bytes: """Encode the message and return as a bytes buffer. @@ -213,6 +215,7 @@ def encoded(self) -> bytes: 'timestamp': self.timestamp, 'next_timestamp': self.next_timestamp, 'settings_overlay': self.settings_overlay, + 'message_number': self.message_number, 'data': self.data } diff --git a/libmuscle/python/libmuscle/test/test_communicator.py b/libmuscle/python/libmuscle/test/test_communicator.py index 8f0f1238..a4f3a751 100644 --- a/libmuscle/python/libmuscle/test/test_communicator.py +++ b/libmuscle/python/libmuscle/test/test_communicator.py @@ -281,6 +281,7 @@ def test_send_message(communicator, message) -> None: assert msg.timestamp == 0.0 assert msg.next_timestamp is None assert msg.settings_overlay == Settings() + assert msg.message_number == 0 assert msg.data == b'test' @@ -304,6 +305,7 @@ def test_send_msgpack(communicator, message2) -> None: assert msg.sender == 'kernel[13].out' assert msg.receiver == 'other.in[13]' assert msg.settings_overlay == Settings() + assert msg.message_number == 0 assert msg.data == {'test': 17} @@ -318,6 +320,7 @@ def test_send_message_with_slot(communicator2, message) -> None: assert msg.sender == 'other.out[13]' assert msg.receiver == 'kernel[13].in' assert msg.settings_overlay == Settings() + assert msg.message_number == 0 assert msg.data == b'test' @@ -348,6 +351,7 @@ def test_send_message_with_settings(communicator, message) -> None: assert msg.sender == 'kernel[13].out' assert msg.receiver == 'other.in[13]' assert msg.settings_overlay.as_ordered_dict() == {'test2': 'testing'} + assert msg.message_number == 0 assert msg.data == b'test' @@ -363,6 +367,7 @@ def test_send_settings(communicator, message) -> None: assert msg.sender == 'kernel[13].out' assert msg.receiver == 'other.in[13]' assert msg.settings_overlay == Settings() + assert msg.message_number == 0 assert msg.data == Settings({'test1': 'testing'}) @@ -378,6 +383,7 @@ def test_close_port(communicator) -> None: assert msg.timestamp == float('inf') assert msg.next_timestamp is None assert msg.settings_overlay == Settings() + assert msg.message_number == 0 assert isinstance(msg.data, ClosePort) @@ -385,7 +391,7 @@ def test_receive_message(communicator) -> None: client_mock = MagicMock() client_mock.receive.return_value = MPPMessage( Reference('other.out[13]'), Reference('kernel[13].in'), - None, 0.0, None, Settings({'test1': 12}), + None, 0.0, None, Settings({'test1': 12}), 0, b'test').encoded() get_client_mock = MagicMock(return_value=client_mock) communicator._Communicator__get_client = get_client_mock @@ -424,7 +430,7 @@ def test_receive_msgpack(communicator) -> None: client_mock = MagicMock() client_mock.receive.return_value = MPPMessage( Reference('other.out[13]'), Reference('kernel[13].in'), - None, 0.0, None, Settings({'test1': 12}), + None, 0.0, None, Settings({'test1': 12}), 0, {'test': 13}).encoded() get_client_mock = MagicMock(return_value=client_mock) communicator._Communicator__get_client = get_client_mock @@ -441,7 +447,7 @@ def test_receive_with_slot(communicator2) -> None: client_mock = MagicMock() client_mock.receive.return_value = MPPMessage( Reference('kernel[13].out'), Reference('other.in[13]'), - None, 0.0, None, Settings({'test': 'testing'}), + None, 0.0, None, Settings({'test': 'testing'}), 0, b'test').encoded() get_client_mock = MagicMock(return_value=client_mock) communicator2._Communicator__get_client = get_client_mock @@ -459,7 +465,7 @@ def test_receive_message_resizable(communicator3) -> None: client_mock = MagicMock() client_mock.receive.return_value = MPPMessage( Reference('other.out[13]'), Reference('kernel.in[13]'), - 20, 0.0, None, Settings({'test': 'testing'}), + 20, 0.0, None, Settings({'test': 'testing'}), 0, b'test').encoded() get_client_mock = MagicMock(return_value=client_mock) communicator3._Communicator__get_client = get_client_mock @@ -477,7 +483,7 @@ def test_receive_with_settings(communicator) -> None: client_mock = MagicMock() client_mock.receive.return_value = MPPMessage( Reference('other.out[13]'), Reference('kernel[13].in'), - None, 0.0, None, Settings({'test2': 3.1}), + None, 0.0, None, Settings({'test2': 3.1}), 0, b'test').encoded() get_client_mock = MagicMock(return_value=client_mock) communicator._Communicator__get_client = get_client_mock @@ -496,7 +502,7 @@ def test_receive_msgpack_with_slot_and_settings(communicator2) -> None: client_mock.receive.return_value = MPPMessage( Reference('kernel[13].out'), Reference('other.in[13]'), None, 0.0, 1.0, - Settings({'test': 'testing'}), 'test').encoded() + Settings({'test': 'testing'}), 0, 'test').encoded() get_client_mock = MagicMock(return_value=client_mock) communicator2._Communicator__get_client = get_client_mock communicator2._profiler = MagicMock() @@ -513,7 +519,7 @@ def test_receive_settings(communicator) -> None: client_mock = MagicMock() client_mock.receive.return_value = MPPMessage( Reference('other.out[13]'), Reference('kernel[13].in'), - None, 0.0, None, Settings({'test1': 12}), + None, 0.0, None, Settings({'test1': 12}), 0, Settings({'test': 13})).encoded() get_client_mock = MagicMock(return_value=client_mock) communicator._Communicator__get_client = get_client_mock @@ -531,7 +537,7 @@ def test_receive_close_port(communicator) -> None: client_mock = MagicMock() client_mock.receive.return_value = MPPMessage( Reference('other.out[13]'), Reference('kernel[13].in'), - None, 0.0, None, Settings(), ClosePort()).encoded() + None, 0.0, None, Settings(), 0, ClosePort()).encoded() get_client_mock = MagicMock(return_value=client_mock) communicator._Communicator__get_client = get_client_mock communicator._profiler = MagicMock() @@ -545,6 +551,6 @@ def test_get_message(communicator, message) -> None: communicator.send_message('out', message) ref_message = MPPMessage( Reference('kernel[13].out'), Reference('other.in[13]'), - None, 0.0, None, Settings(), b'test').encoded() + None, 0.0, None, Settings(), 0, b'test').encoded() assert communicator._post_office.get_message( 'other.in[13]') == ref_message diff --git a/libmuscle/python/libmuscle/test/test_mpp_message.py b/libmuscle/python/libmuscle/test/test_mpp_message.py index 79ee8ee6..dce3ed88 100644 --- a/libmuscle/python/libmuscle/test/test_mpp_message.py +++ b/libmuscle/python/libmuscle/test/test_mpp_message.py @@ -18,13 +18,14 @@ def test_create() -> None: data = (12345).to_bytes(2, 'little', signed=True) msg = MPPMessage( sender, receiver, None, timestamp, next_timestamp, - settings_overlay, data) + settings_overlay, 0, data) assert msg.sender == sender assert msg.receiver == receiver assert msg.port_length is None assert msg.timestamp == 10.0 assert msg.next_timestamp == 11.0 assert msg.settings_overlay == settings_overlay + assert msg.message_number == 0 assert msg.data == data @@ -43,7 +44,7 @@ def test_grid_encode() -> None: grid = Grid(array, ['x', 'y', 'z']) msg = MPPMessage( sender, receiver, None, timestamp, next_timestamp, Settings(), - grid) + 0, grid) wire_data = msg.encoded() mcp_decoded = msgpack.unpackb(wire_data, raw=False) @@ -86,6 +87,7 @@ def test_grid_decode() -> None: 'timestamp': 0.0, 'next_timestamp': None, 'settings_overlay': msgpack.ExtType(1, settings_data), + 'message_number': 0, 'data': msgpack.ExtType(2, grid_data)} wire_data = msgpack.packb(msg_dict, use_bin_type=True) @@ -135,7 +137,7 @@ def test_grid_roundtrip() -> None: grid = Grid(array, ['x', 'y', 'z']) msg = MPPMessage( sender, receiver, None, timestamp, next_timestamp, Settings(), - grid) + 0, grid) wire_data = msg.encoded() msg_out = MPPMessage.from_bytes(wire_data) @@ -169,7 +171,7 @@ def test_non_contiguous_grid_roundtrip() -> None: grid = Grid(array.real, ['a', 'b', 'c']) msg = MPPMessage( sender, receiver, None, timestamp, next_timestamp, Settings(), - grid) + 0, grid) wire_data = msg.encoded() msg_out = MPPMessage.from_bytes(wire_data) diff --git a/libmuscle/python/libmuscle/test/test_outbox.py b/libmuscle/python/libmuscle/test/test_outbox.py index 6b22f068..cb4af31a 100644 --- a/libmuscle/python/libmuscle/test/test_outbox.py +++ b/libmuscle/python/libmuscle/test/test_outbox.py @@ -19,6 +19,7 @@ def message(): Ref('sender.out'), Ref('receiver.in'), None, 0.0, 1.0, bytes(), + 0, 'testing'.encode('utf-8')) From 5764f9e4155c9831f08e54420fa7e77c132fc234 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Mon, 29 Aug 2022 09:49:40 +0200 Subject: [PATCH 014/183] Implement message counters on Port (C++) --- libmuscle/cpp/src/libmuscle/port.cpp | 61 ++++++++++++++++++++++++++- libmuscle/cpp/src/libmuscle/port.hpp | 63 +++++++++++++++++++++++++++- 2 files changed, 122 insertions(+), 2 deletions(-) diff --git a/libmuscle/cpp/src/libmuscle/port.cpp b/libmuscle/cpp/src/libmuscle/port.cpp index 691c75cf..2cb119ce 100644 --- a/libmuscle/cpp/src/libmuscle/port.cpp +++ b/libmuscle/cpp/src/libmuscle/port.cpp @@ -11,12 +11,26 @@ using ymmsl::Identifier; using ymmsl::Operator; +namespace { + +template< typename T> +inline void extend_vector_to_size( + std::vector &vec, const int minsize, const T &val) { + if(static_cast(vec.size()) < minsize) { + vec.resize(minsize, val); + } +} + +} + + namespace libmuscle { namespace impl { Port::Port( std::string const & name, Operator oper, bool is_vector, bool is_connected, - int our_ndims, std::vector peer_dims) + int our_ndims, std::vector peer_dims, + std::vector num_messages) : ::ymmsl::Port(Identifier(name), oper) { is_connected_ = is_connected; @@ -53,6 +67,12 @@ Port::Port( } is_resizable_ = is_vector && (our_ndims == static_cast(peer_dims.size())); + if (!num_messages.empty()) { + num_messages_ = num_messages; + is_resuming_.resize(num_messages_.size(), true); + } + extend_vector_to_size(num_messages_, std::min(1, length_), 0); + extend_vector_to_size(is_resuming_, std::min(1, length_), false); } bool Port::is_connected() const { @@ -94,6 +114,11 @@ void Port::set_length(int length) { if (length != length_) { length_ = length; is_open_ = std::vector(length_, true); + // Using extend here to not discard any information about message + // numbers between resizes. Note that _num_messages and _is_resuming + // may be longer than self._length! + extend_vector_to_size(num_messages_, std::min(1, length_), 0); + extend_vector_to_size(is_resuming_, std::min(1, length_), false); } } @@ -105,5 +130,39 @@ void Port::set_closed(int slot) { is_open_[slot] = false; } +void Port::increment_num_messages() { + num_messages_[0] ++; + set_resumed(); +} + +void Port::increment_num_messages(int slot) { + num_messages_[slot] ++; + set_resumed(slot); +} + +int Port::get_num_messages() const { + return num_messages_[0]; +} + +int Port::get_num_messages(int slot) const { + return num_messages_[slot]; +} + +bool Port::is_resuming() const { + return is_resuming_[0]; +} + +bool Port::is_resuming(int slot) const { + return is_resuming_[slot]; +} + +void Port::set_resumed() { + is_resuming_[0] = false; +} + +void Port::set_resumed(int slot) { + is_resuming_[slot] = false; +} + } } diff --git a/libmuscle/cpp/src/libmuscle/port.hpp b/libmuscle/cpp/src/libmuscle/port.hpp index 41bf8898..e0b6c61c 100644 --- a/libmuscle/cpp/src/libmuscle/port.hpp +++ b/libmuscle/cpp/src/libmuscle/port.hpp @@ -11,6 +11,10 @@ namespace libmuscle { namespace impl { * Ports can be used to send or receive messages. They have a name and an * operator, as well as a set of dimensions that determines the valid slot * indices for sending or receiving on this port. + * + * Ports keep track of the amount of messages sent or received on the port. + * However, the actual incrementing and validation is done in + * Communicator. */ class Port : public ::ymmsl::Port { public: @@ -26,7 +30,8 @@ class Port : public ::ymmsl::Port { Port( std::string const & name, ::ymmsl::Operator oper, bool is_vector, bool is_connected, - int our_ndims, std::vector peer_dims); + int our_ndims, std::vector peer_dims, + std::vector num_messages=std::vector(0)); // Note: we only ever use this Port in libmuscle, and only use // ymmsl::Port in ymmsl. Port objects are always handled by value, so @@ -104,11 +109,67 @@ class Port : public ::ymmsl::Port { */ void set_closed(int slot); + /** Increment amount of messages sent or received. + */ + void increment_num_messages(); + + /** Increment amount of messages sent or received. + * + * Only valid for vector ports. + * + * @param slot The slot that is sent/received on + */ + void increment_num_messages(int slot); + + /** Get the amount of messages sent or received + */ + int get_num_messages() const; + + /** Get the amount of messages sent or received + * + * Only valid for vector ports. + * + * @param slot The slot that is sent/received on + */ + int get_num_messages(int slot) const; + + /** True when this port has resumed. + * + * After resumption, each port/slot may discard exactly one message. + * is_resuming keeps track of this state. + */ + bool is_resuming() const; + + /** True when this port has resumed. + * + * After resumption, each port/slot may discard exactly one message. + * is_resuming keeps track of this state. + * + * Only valid for vector ports. + * + * @param slot The slot that is sent/received on + */ + bool is_resuming(int slot) const; + + /** Mark that this port has resumed and may no longer discard messages. + */ + void set_resumed(); + + /** Mark that this port has resumed and may no longer discard messages. + * + * Only valid for vector ports. + * + * @param slot The slot that is sent/received on + */ + void set_resumed(int slot); + private: bool is_connected_; int length_; bool is_resizable_; std::vector is_open_; + std::vector num_messages_; + std::vector is_resuming_; }; } } From c702220acfbd7eb16f1838c0c7ff410ac4c28a65 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Mon, 29 Aug 2022 10:56:07 +0200 Subject: [PATCH 015/183] Fix bugs and add Optional call signatures in Port --- libmuscle/cpp/src/libmuscle/port.cpp | 36 ++++++++++++++++++++++++---- libmuscle/cpp/src/libmuscle/port.hpp | 27 +++++++++++++++++++++ 2 files changed, 59 insertions(+), 4 deletions(-) diff --git a/libmuscle/cpp/src/libmuscle/port.cpp b/libmuscle/cpp/src/libmuscle/port.cpp index 2cb119ce..c4e7b4f9 100644 --- a/libmuscle/cpp/src/libmuscle/port.cpp +++ b/libmuscle/cpp/src/libmuscle/port.cpp @@ -71,8 +71,8 @@ Port::Port( num_messages_ = num_messages; is_resuming_.resize(num_messages_.size(), true); } - extend_vector_to_size(num_messages_, std::min(1, length_), 0); - extend_vector_to_size(is_resuming_, std::min(1, length_), false); + extend_vector_to_size(num_messages_, std::max(1, length_), 0); + extend_vector_to_size(is_resuming_, std::max(1, length_), false); } bool Port::is_connected() const { @@ -117,8 +117,8 @@ void Port::set_length(int length) { // Using extend here to not discard any information about message // numbers between resizes. Note that _num_messages and _is_resuming // may be longer than self._length! - extend_vector_to_size(num_messages_, std::min(1, length_), 0); - extend_vector_to_size(is_resuming_, std::min(1, length_), false); + extend_vector_to_size(num_messages_, std::max(1, length_), 0); + extend_vector_to_size(is_resuming_, std::max(1, length_), false); } } @@ -140,6 +140,13 @@ void Port::increment_num_messages(int slot) { set_resumed(slot); } +void Port::increment_num_messages(Optional slot) { + if(slot.is_set()) + increment_num_messages(slot.get()); + else + increment_num_messages(); +} + int Port::get_num_messages() const { return num_messages_[0]; } @@ -148,6 +155,13 @@ int Port::get_num_messages(int slot) const { return num_messages_[slot]; } +int Port::get_num_messages(Optional slot) const { + if(slot.is_set()) + return get_num_messages(slot.get()); + else + return get_num_messages(); +} + bool Port::is_resuming() const { return is_resuming_[0]; } @@ -156,6 +170,13 @@ bool Port::is_resuming(int slot) const { return is_resuming_[slot]; } +bool Port::is_resuming(Optional slot) const { + if(slot.is_set()) + return is_resuming(slot.get()); + else + return is_resuming(); +} + void Port::set_resumed() { is_resuming_[0] = false; } @@ -164,5 +185,12 @@ void Port::set_resumed(int slot) { is_resuming_[slot] = false; } +void Port::set_resumed(Optional slot) { + if(slot.is_set()) + set_resumed(slot.get()); + else + set_resumed(); +} + } } diff --git a/libmuscle/cpp/src/libmuscle/port.hpp b/libmuscle/cpp/src/libmuscle/port.hpp index e0b6c61c..913a1917 100644 --- a/libmuscle/cpp/src/libmuscle/port.hpp +++ b/libmuscle/cpp/src/libmuscle/port.hpp @@ -121,6 +121,12 @@ class Port : public ::ymmsl::Port { */ void increment_num_messages(int slot); + /** Increment amount of messages sent or received. + * + * @param slot The slot that is sent/received on + */ + void increment_num_messages(Optional slot); + /** Get the amount of messages sent or received */ int get_num_messages() const; @@ -133,6 +139,12 @@ class Port : public ::ymmsl::Port { */ int get_num_messages(int slot) const; + /** Get the amount of messages sent or received + * + * @param slot The slot that is sent/received on + */ + int get_num_messages(Optional slot) const; + /** True when this port has resumed. * * After resumption, each port/slot may discard exactly one message. @@ -151,6 +163,15 @@ class Port : public ::ymmsl::Port { */ bool is_resuming(int slot) const; + /** True when this port has resumed. + * + * After resumption, each port/slot may discard exactly one message. + * is_resuming keeps track of this state. + * + * @param slot The slot that is sent/received on + */ + bool is_resuming(Optional slot) const; + /** Mark that this port has resumed and may no longer discard messages. */ void set_resumed(); @@ -163,6 +184,12 @@ class Port : public ::ymmsl::Port { */ void set_resumed(int slot); + /** Mark that this port has resumed and may no longer discard messages. + * + * @param slot The slot that is sent/received on + */ + void set_resumed(Optional slot); + private: bool is_connected_; int length_; From bb0a1e12b5c31cd7c07289a8584f99631cfc5116 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Mon, 29 Aug 2022 10:57:54 +0200 Subject: [PATCH 016/183] Add message numbers to MMPMessage (C++) --- libmuscle/cpp/src/libmuscle/communicator.cpp | 36 ++++++++++++++++--- libmuscle/cpp/src/libmuscle/mpp_message.cpp | 4 +++ libmuscle/cpp/src/libmuscle/mpp_message.hpp | 3 +- .../libmuscle/tests/mocks/mock_mpp_client.cpp | 2 +- .../tests/mocks/mock_post_office.cpp | 4 +-- .../tests/tcp_transport_server_test.cpp | 1 + .../src/libmuscle/tests/test_mpp_message.cpp | 8 +++-- .../cpp/src/libmuscle/tests/test_outbox.cpp | 1 + .../src/libmuscle/tests/test_post_office.cpp | 2 +- .../tests/test_tcp_communication.cpp | 3 +- 10 files changed, 52 insertions(+), 12 deletions(-) diff --git a/libmuscle/cpp/src/libmuscle/communicator.cpp b/libmuscle/cpp/src/libmuscle/communicator.cpp index 4e9c5139..b74b4497 100644 --- a/libmuscle/cpp/src/libmuscle/communicator.cpp +++ b/libmuscle/cpp/src/libmuscle/communicator.cpp @@ -116,7 +116,7 @@ void Communicator::send_message( // log sending on disconnected port return; - // Port const & port = ports_.at(port_name); + Port & port = ports_.at(port_name); // TODO start profile event @@ -126,13 +126,14 @@ void Communicator::send_message( Data settings_overlay(message.settings()); Optional port_length; - if (ports_.at(port_name).is_resizable()) - port_length = ports_.at(port_name).get_length(); + if (port.is_resizable()) + port_length = port.get_length(); MPPMessage mpp_message( snd_endpoint.ref(), recv_endpoint.ref(), port_length, message.timestamp(), Optional(), - settings_overlay, message.data()); + settings_overlay, port.get_num_messages(slot), message.data()); + port.increment_num_messages(slot); if (message.has_next_timestamp()) mpp_message.next_timestamp = message.next_timestamp(); @@ -204,6 +205,33 @@ Message Communicator::receive_message( // TODO stop and finalise profile event + int expected_message_number = port.get_num_messages(slot); + // TODO: handle f_init port counts for STATELESS and WEAKLY_STATEFUL + // components which didn't load a snapshot + if (expected_message_number != mpp_message.message_number) { + if (expected_message_number - 1 == mpp_message.message_number and + port.is_resuming(slot)) { + if (slot.is_set()) + logger_.debug("Discarding received message on ", port_name, + "[", slot.get(), "]: resuming from weakly", + " constistent snapshot"); + else + logger_.debug("Discarding received message on ", port_name, + ": resuming from weakly constistent snapshot"); + port.set_resumed(slot); + return receive_message(port_name, slot, default_msg); + } + std::ostringstream oss; + oss << "Received message on " << port_name; + if (slot.is_set()) + oss << "[" << slot.get() << "]"; + oss << " with unexpected message number " << mpp_message.message_number; + oss << ". Was expecting " << expected_message_number; + oss << ". Are you resuming from an inconsistent snapshot?"; + throw std::runtime_error(oss.str()); + } + port.increment_num_messages(slot); + if (slot.is_set()) logger_.debug("Received message on ", port_name, "[", slot.get(), "]"); else diff --git a/libmuscle/cpp/src/libmuscle/mpp_message.cpp b/libmuscle/cpp/src/libmuscle/mpp_message.cpp index 2962e31c..bf1be0f0 100644 --- a/libmuscle/cpp/src/libmuscle/mpp_message.cpp +++ b/libmuscle/cpp/src/libmuscle/mpp_message.cpp @@ -16,6 +16,7 @@ MPPMessage::MPPMessage( ::libmuscle::impl::Optional port_length, double timestamp, ::libmuscle::impl::Optional next_timestamp, DataConstRef const & settings_overlay, + int message_number, DataConstRef const & data ) : sender(sender) @@ -24,6 +25,7 @@ MPPMessage::MPPMessage( , timestamp(timestamp) , next_timestamp(next_timestamp) , settings_overlay(settings_overlay) + , message_number(message_number) , data(data) {} @@ -48,6 +50,7 @@ MPPMessage MPPMessage::from_bytes(DataConstRef const & data) { dict["timestamp"].as(), next_timestamp, dict["settings_overlay"], + dict["message_number"].as(), dict["data"]); } @@ -67,6 +70,7 @@ DataConstRef MPPMessage::encoded() const { "timestamp", timestamp, "next_timestamp", next_timestamp_data, "settings_overlay", settings_overlay, + "message_number", message_number, "data", data ); diff --git a/libmuscle/cpp/src/libmuscle/mpp_message.hpp b/libmuscle/cpp/src/libmuscle/mpp_message.hpp index 50e8a49b..96a26fe0 100644 --- a/libmuscle/cpp/src/libmuscle/mpp_message.hpp +++ b/libmuscle/cpp/src/libmuscle/mpp_message.hpp @@ -32,7 +32,7 @@ struct MPPMessage { ::ymmsl::Reference const & sender, ::ymmsl::Reference const & receiver, ::libmuscle::impl::Optional port_length, double timestamp, ::libmuscle::impl::Optional next_timestamp, - DataConstRef const & settings_overlay, + DataConstRef const & settings_overlay, int message_number, DataConstRef const & data); /** Create an MCP Message from an encoded buffer. @@ -53,6 +53,7 @@ struct MPPMessage { double timestamp; ::libmuscle::impl::Optional next_timestamp; DataConstRef settings_overlay; + int message_number; DataConstRef data; }; diff --git a/libmuscle/cpp/src/libmuscle/tests/mocks/mock_mpp_client.cpp b/libmuscle/cpp/src/libmuscle/tests/mocks/mock_mpp_client.cpp index c23dbafc..55ae3a76 100644 --- a/libmuscle/cpp/src/libmuscle/tests/mocks/mock_mpp_client.cpp +++ b/libmuscle/cpp/src/libmuscle/tests/mocks/mock_mpp_client.cpp @@ -44,7 +44,7 @@ Settings MockMPPClient::make_overlay_() { } MPPMessage MockMPPClient::next_receive_message( - "test.out", "test2.in", 0, 0.0, 1.0, make_overlay_(), Data::dict("test1", 12)); + "test.out", "test2.in", 0, 0.0, 1.0, make_overlay_(), 0, Data::dict("test1", 12)); Reference MockMPPClient::last_receiver("_none"); diff --git a/libmuscle/cpp/src/libmuscle/tests/mocks/mock_post_office.cpp b/libmuscle/cpp/src/libmuscle/tests/mocks/mock_post_office.cpp index e1d66eac..6d2bb3cc 100644 --- a/libmuscle/cpp/src/libmuscle/tests/mocks/mock_post_office.cpp +++ b/libmuscle/cpp/src/libmuscle/tests/mocks/mock_post_office.cpp @@ -6,13 +6,13 @@ int MockPostOffice::handle_request( char const * res_buf, std::size_t res_len, std::unique_ptr & response) { response = std::make_unique( - MPPMessage("test.out", "test2.in", 0, 0.0, 1.0, Data(), Data()).encoded()); + MPPMessage("test.out", "test2.in", 0, 0.0, 1.0, Data(), 0, Data()).encoded()); return -1; } std::unique_ptr MockPostOffice::get_response(int fd) { return std::make_unique( - MPPMessage("test.out", "test2.in", 0, 0.0, 1.0, Data(), Data()).encoded()); + MPPMessage("test.out", "test2.in", 0, 0.0, 1.0, Data(), 0, Data()).encoded()); } void MockPostOffice::deposit( diff --git a/libmuscle/cpp/src/libmuscle/tests/tcp_transport_server_test.cpp b/libmuscle/cpp/src/libmuscle/tests/tcp_transport_server_test.cpp index 9e069031..248f597f 100644 --- a/libmuscle/cpp/src/libmuscle/tests/tcp_transport_server_test.cpp +++ b/libmuscle/cpp/src/libmuscle/tests/tcp_transport_server_test.cpp @@ -38,6 +38,7 @@ int main(int argc, char *argv[]) { "test_sender.port", receiver, 10, 0.0, 1.0, overlay_settings, + 0, data_dict); auto msg_data = std::make_unique(msg.encoded()); post_office.deposit(receiver, std::move(msg_data)); diff --git a/libmuscle/cpp/src/libmuscle/tests/test_mpp_message.cpp b/libmuscle/cpp/src/libmuscle/tests/test_mpp_message.cpp index 88ba96c9..53f2ed28 100644 --- a/libmuscle/cpp/src/libmuscle/tests/test_mpp_message.cpp +++ b/libmuscle/cpp/src/libmuscle/tests/test_mpp_message.cpp @@ -29,7 +29,7 @@ TEST(test_mcp_message, create_mcp_message) { Reference("sender.port"), Reference("receiver.port"), 10, 100.1, 101.0, - test, abc + test, 0, abc ); ASSERT_EQ(m.sender, "sender.port"); @@ -38,6 +38,7 @@ TEST(test_mcp_message, create_mcp_message) { ASSERT_EQ(m.timestamp, 100.1); ASSERT_EQ(m.next_timestamp, 101.0); ASSERT_EQ(m.settings_overlay.as(), "test"); + ASSERT_EQ(m.message_number, 0); ASSERT_EQ(m.data.as(), "abc"); } @@ -48,7 +49,7 @@ TEST(test_mcp_message, create_mcp_message_minimal) { Reference("sender.port"), Reference("receiver.port"), {}, 100.1, {}, - test, abc + test, 0, abc ); ASSERT_EQ(m.sender, "sender.port"); @@ -57,6 +58,7 @@ TEST(test_mcp_message, create_mcp_message_minimal) { ASSERT_EQ(m.timestamp, 100.1); ASSERT_FALSE(m.next_timestamp.is_set()); ASSERT_TRUE(m.settings_overlay.is_nil()); + ASSERT_EQ(m.message_number, 0); ASSERT_TRUE(m.data.is_nil()); } @@ -68,6 +70,7 @@ TEST(test_mcp_message, from_bytes) { "timestamp", 100.1, "next_timestamp", Data(), "settings_overlay", Data(), + "message_number", 0, "data", Data() ); @@ -84,6 +87,7 @@ TEST(test_mcp_message, from_bytes) { ASSERT_EQ(m.timestamp, 100.1); ASSERT_FALSE(m.next_timestamp.is_set()); ASSERT_TRUE(m.settings_overlay.is_nil()); + ASSERT_EQ(m.message_number, 0); ASSERT_TRUE(m.data.is_nil()); } diff --git a/libmuscle/cpp/src/libmuscle/tests/test_outbox.cpp b/libmuscle/cpp/src/libmuscle/tests/test_outbox.cpp index 07486cb3..0d6769c5 100644 --- a/libmuscle/cpp/src/libmuscle/tests/test_outbox.cpp +++ b/libmuscle/cpp/src/libmuscle/tests/test_outbox.cpp @@ -36,6 +36,7 @@ TEST(libmuscle_outbox, test_deposit_retrieve_message) { Optional(), 0.0, 1.0, DataConstRef(), + 0, DataConstRef("testing")); auto message_data = std::make_unique(message.encoded()); diff --git a/libmuscle/cpp/src/libmuscle/tests/test_post_office.cpp b/libmuscle/cpp/src/libmuscle/tests/test_post_office.cpp index 68af8bf7..f6cf05c2 100644 --- a/libmuscle/cpp/src/libmuscle/tests/test_post_office.cpp +++ b/libmuscle/cpp/src/libmuscle/tests/test_post_office.cpp @@ -36,7 +36,7 @@ std::unique_ptr make_message() { "test_sender.port", "test_receiver.port", Optional(), 0.0, 1.0, - DataConstRef(), DataConstRef()); + DataConstRef(), 0, DataConstRef()); return std::make_unique(msg.encoded()); } diff --git a/libmuscle/cpp/src/libmuscle/tests/test_tcp_communication.cpp b/libmuscle/cpp/src/libmuscle/tests/test_tcp_communication.cpp index f9c60c30..c6400404 100644 --- a/libmuscle/cpp/src/libmuscle/tests/test_tcp_communication.cpp +++ b/libmuscle/cpp/src/libmuscle/tests/test_tcp_communication.cpp @@ -36,7 +36,7 @@ TEST(test_tcp_communication, send_receive) { MPPMessage msg( "test_sender.port", receiver, 10, 0.0, 1.0, - Data::dict("par1", 13), + Data::dict("par1", 13), 1, Data::dict("var1", 1, "var2", 2.0, "var3", "3")); auto msg_data = std::make_unique(msg.encoded()); post_office.deposit(receiver, std::move(msg_data)); @@ -53,6 +53,7 @@ TEST(test_tcp_communication, send_receive) { ASSERT_EQ(m.timestamp, 0.0); ASSERT_EQ(m.next_timestamp, 1.0); ASSERT_EQ(m.settings_overlay["par1"].as(), 13); + ASSERT_EQ(m.message_number, 1); ASSERT_EQ(m.data["var1"].as(), 1); ASSERT_EQ(m.data["var2"].as(), 2.0); ASSERT_EQ(m.data["var3"].as(), "3"); From 091c0511702c48a73bfc8822b9c296e4788efbd0 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Mon, 29 Aug 2022 10:58:12 +0200 Subject: [PATCH 017/183] Add missed slot argument --- libmuscle/python/libmuscle/communicator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libmuscle/python/libmuscle/communicator.py b/libmuscle/python/libmuscle/communicator.py index af7e14bd..f565f6c5 100644 --- a/libmuscle/python/libmuscle/communicator.py +++ b/libmuscle/python/libmuscle/communicator.py @@ -322,7 +322,7 @@ def receive_message(self, port_name: str, slot: Optional[int] = None, port.is_resuming(slot)): _logger.debug(f'Discarding received message on {port_and_slot}' ': resuming from weakly consistent snapshot') - port.set_resumed() + port.set_resumed(slot) return self.receive_message(port_name, slot, default) raise RuntimeError(f'Received message on {port_and_slot} with' ' unexpected message number' From 2da60951f3dbdd9f47584cddb41197b9f11374be Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Mon, 29 Aug 2022 13:04:58 +0200 Subject: [PATCH 018/183] Getting/restoring port message counts --- libmuscle/cpp/src/libmuscle/port.cpp | 23 +++++++++++++++-------- libmuscle/cpp/src/libmuscle/port.hpp | 13 +++++++++++-- libmuscle/python/libmuscle/port.py | 25 ++++++++++++++++--------- 3 files changed, 42 insertions(+), 19 deletions(-) diff --git a/libmuscle/cpp/src/libmuscle/port.cpp b/libmuscle/cpp/src/libmuscle/port.cpp index c4e7b4f9..70db0550 100644 --- a/libmuscle/cpp/src/libmuscle/port.cpp +++ b/libmuscle/cpp/src/libmuscle/port.cpp @@ -29,8 +29,7 @@ namespace libmuscle { namespace impl { Port::Port( std::string const & name, Operator oper, bool is_vector, bool is_connected, - int our_ndims, std::vector peer_dims, - std::vector num_messages) + int our_ndims, std::vector peer_dims) : ::ymmsl::Port(Identifier(name), oper) { is_connected_ = is_connected; @@ -67,12 +66,8 @@ Port::Port( } is_resizable_ = is_vector && (our_ndims == static_cast(peer_dims.size())); - if (!num_messages.empty()) { - num_messages_ = num_messages; - is_resuming_.resize(num_messages_.size(), true); - } - extend_vector_to_size(num_messages_, std::max(1, length_), 0); - extend_vector_to_size(is_resuming_, std::max(1, length_), false); + num_messages_.resize(std::max(1, length_), 0); + is_resuming_.resize(std::max(1, length_), false); } bool Port::is_connected() const { @@ -130,6 +125,18 @@ void Port::set_closed(int slot) { is_open_[slot] = false; } +void Port::restore_message_counts(const std::vector &num_messages) { + num_messages_ = std::vector(num_messages); + is_resuming_.clear(); + is_resuming_.resize(num_messages_.size(), true); + extend_vector_to_size(num_messages_, std::max(1, length_), 0); + extend_vector_to_size(is_resuming_, std::max(1, length_), false); +} + +const std::vector & Port::get_message_counts() const { + return num_messages_; +} + void Port::increment_num_messages() { num_messages_[0] ++; set_resumed(); diff --git a/libmuscle/cpp/src/libmuscle/port.hpp b/libmuscle/cpp/src/libmuscle/port.hpp index 913a1917..18cfb5d9 100644 --- a/libmuscle/cpp/src/libmuscle/port.hpp +++ b/libmuscle/cpp/src/libmuscle/port.hpp @@ -30,8 +30,7 @@ class Port : public ::ymmsl::Port { Port( std::string const & name, ::ymmsl::Operator oper, bool is_vector, bool is_connected, - int our_ndims, std::vector peer_dims, - std::vector num_messages=std::vector(0)); + int our_ndims, std::vector peer_dims); // Note: we only ever use this Port in libmuscle, and only use // ymmsl::Port in ymmsl. Port objects are always handled by value, so @@ -109,6 +108,16 @@ class Port : public ::ymmsl::Port { */ void set_closed(int slot); + /** Restore message counts from a snapshot. + * + * @param num_messages message counts of the snapshot + */ + void restore_message_counts(const std::vector &num_messages); + + /** Get the message counts for all slots in this port + */ + const std::vector & get_message_counts() const; + /** Increment amount of messages sent or received. */ void increment_num_messages(); diff --git a/libmuscle/python/libmuscle/port.py b/libmuscle/python/libmuscle/port.py index 9aced2d5..a6f955a5 100644 --- a/libmuscle/python/libmuscle/port.py +++ b/libmuscle/python/libmuscle/port.py @@ -32,8 +32,7 @@ class Port(ymmsl.Port): """ def __init__(self, name: str, operator: Operator, is_vector: bool, - is_connected: bool, our_ndims: int, peer_dims: List[int], - num_messages: Optional[List[int]] = None + is_connected: bool, our_ndims: int, peer_dims: List[int] ) -> None: """Create a Port. @@ -86,13 +85,8 @@ def __init__(self, name: str, operator: Operator, is_vector: bool, self._is_open = [True] self._is_resizable = is_vector and (our_ndims == len(peer_dims)) - self._num_messages = [] # type: List[int] - self._is_resuming = [] # type: List[bool] - if num_messages is not None: - self._num_messages = num_messages - self._is_resuming = [True] * len(num_messages) - _extend_list_to_size(self._num_messages, self._length or 1, 0) - _extend_list_to_size(self._is_resuming, self._length or 1, False) + self._num_messages = [0] * (self._length or 1) + self._is_resuming = [False] * (self._length or 1) # Note: I'm not sure how this will develop exactly, so this class has some # accessors even if those are un-Pythonic; in the future a simple variable @@ -168,6 +162,19 @@ def set_closed(self, slot: Optional[int] = None) -> None: else: self._is_open = [False] + def restore_message_counts(self, num_messages: List[int]) -> None: + """Restore message counts from a snapshot + """ + self._num_messages = num_messages + self._is_resuming = [True] * len(self._num_messages) + _extend_list_to_size(self._num_messages, self._length or 1, 0) + _extend_list_to_size(self._is_resuming, self._length or 1, False) + + def get_message_counts(self) -> List[int]: + """Get a list of message counts for all slots in this port + """ + return self._num_messages.copy() + def increment_num_messages(self, slot: Optional[int] = None) -> None: """Increment amount of messages sent or received. From d07929f604a0830a58c7c3ad9d9aa240ab9f2780 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Tue, 30 Aug 2022 09:22:14 +0200 Subject: [PATCH 019/183] Add a snapshot trigger manager and rename files --- .../python/libmuscle/checkpoint_triggers.py | 337 ++++++++++++++++++ .../python/libmuscle/snapshot_manager.py | 151 -------- ...manager.py => test_checkpoint_triggers.py} | 91 ++++- 3 files changed, 425 insertions(+), 154 deletions(-) create mode 100644 libmuscle/python/libmuscle/checkpoint_triggers.py delete mode 100644 libmuscle/python/libmuscle/snapshot_manager.py rename libmuscle/python/libmuscle/test/{test_snapshot_manager.py => test_checkpoint_triggers.py} (58%) diff --git a/libmuscle/python/libmuscle/checkpoint_triggers.py b/libmuscle/python/libmuscle/checkpoint_triggers.py new file mode 100644 index 00000000..214fd872 --- /dev/null +++ b/libmuscle/python/libmuscle/checkpoint_triggers.py @@ -0,0 +1,337 @@ +import bisect +from datetime import datetime, timezone +import logging +import os +import time +from typing import List, Optional, Union + +from ymmsl import CheckpointRange, CheckpointRules, Checkpoints + + +_logger = logging.getLogger(__name__) + + +def _checkpoint_error(description: str) -> None: + if "MUSCLE_DISABLE_CHECKPOINT_ERRORS" in os.environ: + _logger.warning(f"Suppressed checkpoint error: {description}") + else: + raise RuntimeError(description) + + +class CheckpointTrigger: + """Represents a trigger for creating snapshots""" + + def next_checkpoint(self, cur_time: float) -> Optional[float]: + """Calculate the next checkpoint time + + Args: + cur_time: current time. + + Returns: + The time when a next checkpoint should be taken, or None if this + trigger has no checkpoint after cur_time. + """ + raise NotImplementedError() + + def previous_checkpoint(self, cur_time: float) -> Optional[float]: + """Calculate the previous checkpoint time + + Args: + cur_time: current time. + + Returns: + The time when a previous checkpoint should have been taken, or None + if this trigger has no checkpoint after cur_time. + """ + raise NotImplementedError() + + +class AtCheckpointTrigger(CheckpointTrigger): + """Represents a trigger based on an "at" checkpoint rule + + This triggers at the specified times. + """ + + def __init__(self, at: List[Union[float, int]]) -> None: + """Create an "at" checkpoint trigger + + Args: + at: list of checkpoint moments + """ + self._at = at + self._at.sort() # ymmsl already sorts, but just to be sure + + def next_checkpoint(self, cur_time: float) -> Optional[float]: + if cur_time >= self._at[-1]: + return None # no future checkpoint left + idx = bisect.bisect(self._at, cur_time) + return self._at[idx] + + def previous_checkpoint(self, cur_time: float) -> Optional[float]: + if cur_time < self._at[0]: + return None # no previous checkpoint + idx = bisect.bisect(self._at, cur_time) + return self._at[idx - 1] + + +class RangeCheckpointTrigger(CheckpointTrigger): + """Represents a trigger based on a "ranges" checkpoint rule + + This triggers at a range of checkpoint moments. + + Equivalent an "at" rule ``[start, start + step, start + 2*step, ...]`` for + as long as ``start + i*step <= stop``. + + Stop may be omitted, in which case the range is infinite. + + Start may be omitted, in which case the range is equivalent to an "at" rule + ``[..., -n*step, ..., -step, 0, step, 2*step, ...]`` for as long as + ``i*step <= stop``. + + Note: the "every" rule is a special case of a range with start and stop + omitted, and is handled by this class as well + """ + + def __init__(self, range: CheckpointRange) -> None: + """Create a range of checkpoints + + Args: + range: checkpoint range defining start, stop and step. + """ + self._start = range.start + self._stop = range.stop + self._step = range.step + self._last = None # type: Union[int, float, None] + if self._stop is not None: + start = 0 if self._start is None else self._start + diff = self._stop - start + self._last = start + (diff // self._step) * self._step + + def next_checkpoint(self, cur_time: float) -> Optional[float]: + if self._start is not None and cur_time < self._start: + return float(self._start) + if self._last is not None and cur_time >= self._last: + return None + start = 0 if self._start is None else self._start + diff = cur_time - start + return float(start + (diff // self._step + 1) * self._step) + + def previous_checkpoint(self, cur_time: float) -> Optional[float]: + if self._start is not None and cur_time < self._start: + return None + if self._last is not None and cur_time > self._last: + return float(self._last) + start = 0 if self._start is None else self._start + diff = cur_time - start + return float(start + (diff // self._step) * self._step) + + +class CombinedCheckpointTriggers(CheckpointTrigger): + """Checkpoint trigger based on a combination of "every", "at" and "ranges" + """ + + def __init__(self, checkpoint_rules: Optional[CheckpointRules]) -> None: + """Create a new combined checkpoint trigger from the given rules + + Args: + checkpoint_rules: checkpoint rules (from ymmsl) defining "every", + "at", and/or "ranges" rules + """ + self._triggers = [] # type: List[CheckpointTrigger] + if checkpoint_rules is None: + return + if checkpoint_rules.every is not None: + cp_range = CheckpointRange(step=checkpoint_rules.every) + self._triggers.append(RangeCheckpointTrigger(cp_range)) + if checkpoint_rules.at: + self._triggers.append(AtCheckpointTrigger(checkpoint_rules.at)) + for cp_range in checkpoint_rules.ranges: + self._triggers.append(RangeCheckpointTrigger(cp_range)) + + def next_checkpoint(self, cur_time: float) -> Optional[float]: + checkpoints = (trigger.next_checkpoint(cur_time) + for trigger in self._triggers) + # return earliest of all not-None next-checkpoints + return min((checkpoint + for checkpoint in checkpoints + if checkpoint is not None), + default=None) # return None if all triggers return None + + def previous_checkpoint(self, cur_time: float) -> Optional[float]: + checkpoints = (trigger.previous_checkpoint(cur_time) + for trigger in self._triggers) + # return latest of all not-None previous-checkpoints + return max((checkpoint + for checkpoint in checkpoints + if checkpoint is not None), + default=None) # return None if all triggers return None + + +def _utc_to_monotonic(utc: datetime) -> float: + """Convert UTC time point to a reference value of time.monotonic() + + Args: + utc: datetime in UTC timezone + """ + curmono = time.monotonic() + curutc = datetime.now(timezone.utc) + elapsed_seconds = (curutc - utc).total_seconds() + return curmono - elapsed_seconds + + +class TriggerManager: + """Manages all checkpoint triggers and checks if a snapshot must be saved. + """ + + def __init__(self, reference_utctime: datetime, checkpoints: Checkpoints + ) -> None: + self._monotonic_reference = _utc_to_monotonic(reference_utctime) + + self._wall = CombinedCheckpointTriggers(checkpoints.wallclocktime) + self._prevwall = 0.0 + self._nextwall = self._wall.next_checkpoint(0.0) # type: Optional[float] + + self._sim = CombinedCheckpointTriggers(checkpoints.simulationtime) + self._prevsim = None # type: Optional[float] + self._nextsim = None # type: Optional[float] + self._sim_reset = True + + self._last_triggers = [] # type: List[str] + self._first_reuse = True + + # These attributes are only used to check if implementations are + # following the guidelines + self._should_have_saved = False + self._should_save_final_called = False + self._saved_final_checkpoint = False + + def elapsed_walltime(self) -> float: + """Returns elapsed wallclocktime in seconds. + """ + return time.monotonic() - self._monotonic_reference + + def should_save_snapshot(self, timestamp: float, + next_timestamp: Optional[float]) -> bool: + """Handles instance.should_save_snapshot + """ + if self._should_have_saved: + _checkpoint_error('"should_save_snapshot" or ' + '"should_save_final_snapshot" returned positive' + ' but no snapshot was saved before the next call') + + value = False + elapsed_walltime = self.elapsed_walltime() + if next_timestamp is None: + _logger.warning('No "next_timestamp" provided. Workflow may not' + ' be able to create a consistent snapshot. See ' + 'https://muscle3.readthedocs.io/en/latest/checkpoints.html') + value = self.__should_save(elapsed_walltime, timestamp) + else: + value = self.__should_save(elapsed_walltime, next_timestamp) + self._should_have_saved = value + return value + + def should_save_final_snapshot(self, timestamp: float) -> bool: + """Handles instance.should_save_final_snapshot + """ + if self._should_have_saved: + _checkpoint_error('"should_save_snapshot" or ' + '"should_save_final_snapshot" returned positive' + ' but no snapshot was saved before the next call') + + value = False + if self._max_f_init_next_timestamp is None: + # If the messages on F_INIT do not supply a next_timestamp, we will + # always snapshot just before O_I + value = True + self._last_triggers = ['No "next_timestamp" provided on F_INIT' + ' messages'] + else: + elapsed_walltime = self.elapsed_walltime() + value = self.__should_save(elapsed_walltime, + self._max_f_init_next_timestamp) + + self._should_have_saved = value + self._should_save_final_called = True + return value + + def reuse_instance(self, max_f_init_next_timestamp: Optional[float] + ) -> None: + """Cleanup between instance reuse + + Args: + max_f_init_next_timestamp: the maximum next_timestamp of all + messages pre--received during F_INIT. + """ + self._max_f_init_next_timestamp = max_f_init_next_timestamp + + if self._first_reuse: + self._first_reuse = False + else: + if self._should_have_saved: + _checkpoint_error('"should_save_snapshot" or ' + '"should_save_final_snapshot" returned' + ' positive but no snapshot was saved before' + ' exiting the reuse loop.') + if not (self._should_save_final_called or self._saved_final_checkpoint): + _checkpoint_error('You must call "should_save_final" exactly' + ' once in the reuse loop of an instance that' + ' supports checkpointing.') + self._should_save_final_called = False + self._saved_final_checkpoint = False + + def update_checkpoints(self, simulationtime: float, final: bool) -> float: + """Update last and next checkpoint times when a snapshot is made + + Args: + simulationtime: next timestamp as reported by the instance (if + available, otherwise current timestamp) + + Returns: + Current elapsed walltime + """ + self._prevwall = self.elapsed_walltime() + self._nextwall = self._wall.next_checkpoint(self._prevwall) + + if final and self._max_f_init_next_timestamp is not None: + simulationtime = self._max_f_init_next_timestamp + self._prevsim = simulationtime + self._nextsim = self._sim.next_checkpoint(simulationtime) + + self._should_have_saved = False + self._saved_final_checkpoint = final + return self._prevwall + + def get_triggers(self) -> List[str]: + """Get trigger description(s) for the current reason for checkpointing. + """ + triggers = self._last_triggers + self._last_triggers = [] + return triggers + + def __should_save(self, walltime: float, simulationtime: float) -> bool: + """Check if a checkpoint should be taken + + Args: + walltime: current wallclock time (elapsed since reference) + simulationtime: current/next timestamp as reported by the instance + """ + if self._sim_reset: + # we cannot make assumptions about the start time of a simulation, + # a t=-1000 could make sense if t represents years since CE + # and we should not disallow checkpointing for negative t + previous = self._sim.previous_checkpoint(simulationtime) + if previous is not None: + # there is a checkpoint rule before the current moment, assume + # we should have taken a snapshot back then + self._nextsim = previous + else: + self._nextsim = self._sim.next_checkpoint(simulationtime) + self._sim_reset = False + + self._last_triggers = [] + if self._nextwall is not None and walltime >= self._nextwall: + self._last_triggers.append(f"wallclocktime >= {self._nextwall}") + if self._nextsim is not None and simulationtime >= self._nextsim: + self._last_triggers.append(f"simulationtime >= {self._nextsim}") + return bool(self._last_triggers) diff --git a/libmuscle/python/libmuscle/snapshot_manager.py b/libmuscle/python/libmuscle/snapshot_manager.py deleted file mode 100644 index 36b742d8..00000000 --- a/libmuscle/python/libmuscle/snapshot_manager.py +++ /dev/null @@ -1,151 +0,0 @@ -import bisect -from typing import List, Optional, Union - -from ymmsl import CheckpointRange, CheckpointRules - - -class CheckpointTrigger: - """Represents a trigger for creating snapshots""" - - def next_checkpoint(self, cur_time: float) -> Optional[float]: - """Calculate the next checkpoint time - - Args: - cur_time: current time. - - Returns: - The time when a next checkpoint should be taken, or None if this - trigger has no checkpoint after cur_time. - """ - raise NotImplementedError() - - def previous_checkpoint(self, cur_time: float) -> Optional[float]: - """Calculate the previous checkpoint time - - Args: - cur_time: current time. - - Returns: - The time when a previous checkpoint should have been taken, or None - if this trigger has no checkpoint after cur_time. - """ - raise NotImplementedError() - - -class AtCheckpointTrigger(CheckpointTrigger): - """Represents a trigger based on an "at" checkpoint rule - - This triggers at the specified times. - """ - - def __init__(self, at: List[Union[float, int]]) -> None: - """Create an "at" checkpoint trigger - - Args: - at: list of checkpoint moments - """ - self._at = at - self._at.sort() # ymmsl already sorts, but just to be sure - - def next_checkpoint(self, cur_time: float) -> Optional[float]: - if cur_time >= self._at[-1]: - return None # no future checkpoint left - idx = bisect.bisect(self._at, cur_time) - return self._at[idx] - - def previous_checkpoint(self, cur_time: float) -> Optional[float]: - if cur_time < self._at[0]: - return None # no previous checkpoint - idx = bisect.bisect(self._at, cur_time) - return self._at[idx - 1] - - -class RangeCheckpointTrigger(CheckpointTrigger): - """Represents a trigger based on a "ranges" checkpoint rule - - This triggers at a range of checkpoint moments. - - Equivalent an "at" rule ``[start, start + step, start + 2*step, ...]`` for - as long as ``start + i*step <= stop``. - - Stop may be omitted, in which case the range is infinite. - - Start may be omitted, in which case the range is equivalent to an "at" rule - ``[..., -n*step, ..., -step, 0, step, 2*step, ...]`` for as long as - ``i*step <= stop``. - - Note: the "every" rule is a special case of a range with start and stop - omitted, and is handled by this class as well - """ - - def __init__(self, range: CheckpointRange) -> None: - """Create a range of checkpoints - - Args: - range: checkpoint range defining start, stop and step. - """ - self._start = range.start - self._stop = range.stop - self._step = range.step - self._last = None # type: Union[int, float, None] - if self._stop is not None: - start = 0 if self._start is None else self._start - diff = self._stop - start - self._last = start + (diff // self._step) * self._step - - def next_checkpoint(self, cur_time: float) -> Optional[float]: - if self._start is not None and cur_time < self._start: - return float(self._start) - if self._last is not None and cur_time >= self._last: - return None - start = 0 if self._start is None else self._start - diff = cur_time - start - return float(start + (diff // self._step + 1) * self._step) - - def previous_checkpoint(self, cur_time: float) -> Optional[float]: - if self._start is not None and cur_time < self._start: - return None - if self._last is not None and cur_time > self._last: - return float(self._last) - start = 0 if self._start is None else self._start - diff = cur_time - start - return float(start + (diff // self._step) * self._step) - - -class CombinedCheckpointTriggers(CheckpointTrigger): - """Checkpoint trigger based on a combination of "every", "at" and "ranges" - """ - - def __init__(self, checkpoint_rules: CheckpointRules) -> None: - """Create a new combined checkpoint trigger from the given rules - - Args: - checkpoint_rules: checkpoint rules (from ymmsl) defining "every", - "at", and/or "ranges" rules - """ - self._triggers = [] # type: List[CheckpointTrigger] - if checkpoint_rules.every is not None: - cp_range = CheckpointRange(step=checkpoint_rules.every) - self._triggers.append(RangeCheckpointTrigger(cp_range)) - if checkpoint_rules.at: - self._triggers.append(AtCheckpointTrigger(checkpoint_rules.at)) - for cp_range in checkpoint_rules.ranges: - self._triggers.append(RangeCheckpointTrigger(cp_range)) - - def next_checkpoint(self, cur_time: float) -> Optional[float]: - checkpoints = (trigger.next_checkpoint(cur_time) - for trigger in self._triggers) - # return earliest of all not-None next-checkpoints - return min((checkpoint - for checkpoint in checkpoints - if checkpoint is not None), - default=None) # return None if all triggers return None - - def previous_checkpoint(self, cur_time: float) -> Optional[float]: - checkpoints = (trigger.previous_checkpoint(cur_time) - for trigger in self._triggers) - # return latest of all not-None previous-checkpoints - return max((checkpoint - for checkpoint in checkpoints - if checkpoint is not None), - default=None) # return None if all triggers return None diff --git a/libmuscle/python/libmuscle/test/test_snapshot_manager.py b/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py similarity index 58% rename from libmuscle/python/libmuscle/test/test_snapshot_manager.py rename to libmuscle/python/libmuscle/test/test_checkpoint_triggers.py index b557e5b8..6cb645c2 100644 --- a/libmuscle/python/libmuscle/test/test_snapshot_manager.py +++ b/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py @@ -1,8 +1,12 @@ +from datetime import datetime, timedelta, timezone +import logging +import time import pytest -from ymmsl import CheckpointRange, CheckpointRules +from ymmsl import CheckpointRange, CheckpointRules, Checkpoints -from libmuscle.snapshot_manager import ( - CombinedCheckpointTriggers, AtCheckpointTrigger, RangeCheckpointTrigger) +from libmuscle.checkpoint_triggers import ( + CombinedCheckpointTriggers, AtCheckpointTrigger, RangeCheckpointTrigger, + TriggerManager) def test_at_checkpoint_trigger(): @@ -131,3 +135,84 @@ def test_combined_checkpoint_trigger_at_ranges(): assert trigger.next_checkpoint(125.2) is None assert trigger.previous_checkpoint(125.2) == pytest.approx(100) + + +def test_trigger_manager_reference_time(): + monotonic_now = time.monotonic() + utcnow = datetime.now(timezone.utc) + reference = utcnow - timedelta(seconds=15) + trigger_manager = TriggerManager(reference, Checkpoints()) + elapsed_walltime = trigger_manager.elapsed_walltime() + elapsed_monotonic = time.monotonic() - monotonic_now + assert 15.0 < elapsed_walltime <= (15.0 + elapsed_monotonic) + + +def test_trigger_manager(): + reference = datetime.now(timezone.utc) + trigger_manager = TriggerManager(reference, Checkpoints( + wallclocktime=CheckpointRules(at=[1e-12]), + simulationtime=CheckpointRules(at=[1, 3, 5]))) + + trigger_manager.reuse_instance(7) + + t, t_next = 0.1, 0.2 + assert trigger_manager.should_save_snapshot(t, t_next) + triggers = trigger_manager.get_triggers() + assert len(triggers) == 1 + assert "wallclocktime" in triggers[0] + with pytest.raises(RuntimeError): # did not call save in between + trigger_manager.should_save_snapshot(t, t_next) + trigger_manager.update_checkpoints(t_next, False) + + t, t_next = 0.2, 0.9 + assert not trigger_manager.should_save_snapshot(t, t_next) + + t, t_next = 0.9, 3.1 + assert trigger_manager.should_save_snapshot(t, t_next) + assert len(trigger_manager.get_triggers()) == 1 + trigger_manager.update_checkpoints(t_next, False) + + t, t_next = 3.1, None + assert trigger_manager.should_save_final_snapshot(t) + with pytest.raises(RuntimeError): # did not call save in between + trigger_manager.should_save_snapshot(t, 4.0) + with pytest.raises(RuntimeError): # did not call save in between + trigger_manager.should_save_final_snapshot(t) + assert len(trigger_manager.get_triggers()) > 0 + trigger_manager.update_checkpoints(t, True) + + trigger_manager.reuse_instance(None) + + t, t_next = 7.1, 8.2 + assert not trigger_manager.should_save_snapshot(t, t_next) + with pytest.raises(RuntimeError): # no should_save_final called + trigger_manager.reuse_instance(None) + t, t_next = 8.2, None + assert trigger_manager.should_save_final_snapshot(t) + with pytest.raises(RuntimeError): # not saved + trigger_manager.reuse_instance(None) + trigger_manager.update_checkpoints(t, True) + + trigger_manager.reuse_instance(None) + + +def test_trigger_manager_warnings(caplog: pytest.LogCaptureFixture, + monkeypatch: pytest.MonkeyPatch): + monkeypatch.setenv("MUSCLE_DISABLE_CHECKPOINT_ERRORS", "1") + + reference = datetime.now(timezone.utc) + trigger_manager = TriggerManager(reference, Checkpoints( + simulationtime=CheckpointRules(at=[1, 3, 5]))) + + trigger_manager.reuse_instance(2) + + with caplog.at_level(logging.WARN): + n_records = len(caplog.records) + assert trigger_manager.should_save_snapshot(1.5, None) + assert len(caplog.records) == n_records + 1 + assert "next_timestamp" in caplog.records[-1].message + + n_records = len(caplog.records) + trigger_manager.reuse_instance(None) # suppressed error + assert len(caplog.records) > n_records + assert "Suppressed checkpoint error" in caplog.records[-1].message From 54c5e1ec4eef2d3348ac78ad974cdc4af73a7f75 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Tue, 30 Aug 2022 15:17:44 +0200 Subject: [PATCH 020/183] Communicator get/restore port message counts And unit tests for communicator changes --- libmuscle/python/libmuscle/communicator.py | 24 ++++ .../libmuscle/test/test_communicator.py | 127 ++++++++++++++++++ 2 files changed, 151 insertions(+) diff --git a/libmuscle/python/libmuscle/communicator.py b/libmuscle/python/libmuscle/communicator.py index f565f6c5..7ffff004 100644 --- a/libmuscle/python/libmuscle/communicator.py +++ b/libmuscle/python/libmuscle/communicator.py @@ -366,6 +366,30 @@ def shutdown(self) -> None: for server in self._servers: server.close() + def restore_message_counts(self, port_message_counts: Dict[str, List[int]] + ) -> None: + """Restore message counts on all ports + """ + for port_name, num_messages in port_message_counts.items(): + if port_name == "muscle_settings_in": + self._muscle_settings_in.restore_message_counts(num_messages) + elif port_name in self._ports: + self._ports[port_name].restore_message_counts(num_messages) + else: + raise RuntimeError(f'Unknown port {port_name} in snapshot.' + ' Have your port definitions changed since' + ' the snapshot was taken?') + # TODO decide if we should check whether all ports are covered + + def get_message_counts(self) -> Dict[str, List[int]]: + """Get message counts for all ports on the communicator + """ + port_message_counts = {port_name: port.get_message_counts() + for port_name, port in self._ports.items()} + port_message_counts["muscle_settings_in"] = \ + self._muscle_settings_in.get_message_counts() + return port_message_counts + def __instance_id(self) -> Reference: """Returns our complete instance id. """ diff --git a/libmuscle/python/libmuscle/test/test_communicator.py b/libmuscle/python/libmuscle/test/test_communicator.py index a4f3a751..8fb7a527 100644 --- a/libmuscle/python/libmuscle/test/test_communicator.py +++ b/libmuscle/python/libmuscle/test/test_communicator.py @@ -1,3 +1,4 @@ +import logging from libmuscle.communicator import Communicator, Endpoint, Message from libmuscle.mpp_message import ClosePort, MPPMessage from libmuscle.port import Port @@ -67,6 +68,8 @@ def gpe(p, s) -> Reference: communicator._ports = { 'out': Port('out', Operator.O_I, False, True, 1, []), 'in': Port('in', Operator.S, False, True, 1, [])} + communicator._muscle_settings_in = \ + communicator._Communicator__settings_in_port([]) yield communicator communicator.shutdown() @@ -103,6 +106,8 @@ def gpe(p, s) -> Reference: communicator._ports = { 'out': Port('out', Operator.O_I, True, True, 0, [20]), 'in': Port('in', Operator.S, True, True, 0, [20])} + communicator._muscle_settings_in = \ + communicator._Communicator__settings_in_port([]) yield communicator communicator.shutdown() @@ -554,3 +559,125 @@ def test_get_message(communicator, message) -> None: None, 0.0, None, Settings(), 0, b'test').encoded() assert communicator._post_office.get_message( 'other.in[13]') == ref_message + + +def test_port_message_counts(communicator, message) -> None: + communicator.send_message('out', message) + msg_counts = communicator.get_message_counts() + assert msg_counts == {'out': [1], + 'in': [0], + 'muscle_settings_in': [0]} + + communicator.restore_message_counts({'out': [3], + 'in': [2], + 'muscle_settings_in': [4]}) + communicator.send_message('out', message) + msg_counts = communicator.get_message_counts() + assert msg_counts == {'out': [4], + 'in': [2], + 'muscle_settings_in': [4]} + + # empty post office + communicator._post_office.get_message('other.in[13]') + communicator._post_office.get_message('other.in[13]') + + with pytest.raises(RuntimeError): + communicator.restore_message_counts({"x?invalid_port": 3}) + + +def test_vector_port_message_counts(communicator2, message) -> None: + msg_counts = communicator2.get_message_counts() + assert msg_counts == {'out': [0] * 20, + 'in': [0] * 20, + 'muscle_settings_in': [0]} + + communicator2.send_message('out', message, 13) + msg_counts = communicator2.get_message_counts() + assert msg_counts == {'out': [0] * 13 + [1] + [0] * 6, + 'in': [0] * 20, + 'muscle_settings_in': [0]} + + communicator2.restore_message_counts({'out': list(range(20)), + 'in': list(range(20)), + 'muscle_settings_in': [4]}) + communicator2.send_message('out', message, 13) + msg_counts = communicator2.get_message_counts() + assert msg_counts == {'out': list(range(13)) + [14] + list(range(14, 20)), + 'in': list(range(20)), + 'muscle_settings_in': [4]} + + # empty post office + communicator2._post_office.get_message('kernel[13].in') + communicator2._post_office.get_message('kernel[13].in') + + +def test_port_count_validation(communicator): + client_mock = MagicMock() + client_mock.receive.return_value = MPPMessage( + Reference('other.out[13]'), Reference('kernel[13].in'), + None, 0.0, None, Settings({'test1': 12}), 0, + b'test').encoded() + get_client_mock = MagicMock(return_value=client_mock) + communicator._Communicator__get_client = get_client_mock + communicator._profiler = MagicMock() + + communicator.receive_message('in') + assert communicator.get_message_counts()['in'] == [1] + + with pytest.raises(RuntimeError): + # the message received has message_number = 0 again + communicator.receive_message('in') + + +def test_port_discard_error_on_resume(caplog, communicator): + client_mock = MagicMock() + client_mock.receive.return_value = MPPMessage( + Reference('other.out[13]'), Reference('kernel[13].in'), + None, 0.0, None, Settings({'test1': 12}), 1, + b'test').encoded() + get_client_mock = MagicMock(return_value=client_mock) + communicator._Communicator__get_client = get_client_mock + communicator._profiler = MagicMock() + + communicator.restore_message_counts({'out': [0], + 'in': [2], + 'muscle_settings_in': [0]}) + for port in communicator._ports.values(): + assert port._is_resuming == [True] + assert port.is_resuming(None) + + # In the next block, the first message with message_number=1 is discarded. + # The RuntimeError is raised when 'receiving' the second message with + # message_number=1 + with caplog.at_level(logging.DEBUG): + with pytest.raises(RuntimeError): + communicator.receive_message('in') + # records 0, 2 and 3 are debug logs for starting/receiving on port + assert 'Discarding received message' in caplog.records[1].message + + +def test_port_discard_success_on_resume(caplog, communicator): + client_mock = MagicMock() + client_mock.receive.side_effect = [MPPMessage( + Reference('other.out[13]'), Reference('kernel[13].in'), + None, 0.0, None, Settings({'test1': 12}), message_number, + {'this is message': message_number}).encoded() + for message_number in [1, 2]] + get_client_mock = MagicMock(return_value=client_mock) + communicator._Communicator__get_client = get_client_mock + communicator._profiler = MagicMock() + + communicator.restore_message_counts({'out': [0], + 'in': [2], + 'muscle_settings_in': [0]}) + for port in communicator._ports.values(): + assert port._is_resuming == [True] + assert port.is_resuming(None) + + with caplog.at_level(logging.DEBUG): + msg = communicator.receive_message('in') + # records 0, 2 and 3 are debug logs for starting/receiving on port + assert 'Discarding received message' in caplog.records[1].message + # message_number=1 should be discarded: + assert msg.data == {'this is message': 2} + assert communicator.get_message_counts()['in'] == [3] From dbaaf94fbfae8c82f00e3ae453a95f7b52e7b824 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Tue, 30 Aug 2022 16:10:48 +0200 Subject: [PATCH 021/183] caplog fix when running full test suite --- libmuscle/python/libmuscle/test/test_communicator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libmuscle/python/libmuscle/test/test_communicator.py b/libmuscle/python/libmuscle/test/test_communicator.py index 8fb7a527..bb41cfa5 100644 --- a/libmuscle/python/libmuscle/test/test_communicator.py +++ b/libmuscle/python/libmuscle/test/test_communicator.py @@ -649,7 +649,7 @@ def test_port_discard_error_on_resume(caplog, communicator): # In the next block, the first message with message_number=1 is discarded. # The RuntimeError is raised when 'receiving' the second message with # message_number=1 - with caplog.at_level(logging.DEBUG): + with caplog.at_level(logging.DEBUG, 'libmuscle.communicator'): with pytest.raises(RuntimeError): communicator.receive_message('in') # records 0, 2 and 3 are debug logs for starting/receiving on port @@ -674,7 +674,7 @@ def test_port_discard_success_on_resume(caplog, communicator): assert port._is_resuming == [True] assert port.is_resuming(None) - with caplog.at_level(logging.DEBUG): + with caplog.at_level(logging.DEBUG, 'libmuscle.communicator'): msg = communicator.receive_message('in') # records 0, 2 and 3 are debug logs for starting/receiving on port assert 'Discarding received message' in caplog.records[1].message From c6058c0cf5ef721322c6f0066ec46eb7d59adb0b Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Tue, 30 Aug 2022 16:12:37 +0200 Subject: [PATCH 022/183] Handle simulationtime in TriggerManager --- .../python/libmuscle/checkpoint_triggers.py | 21 +++++++++++-------- .../test/test_checkpoint_triggers.py | 8 +++---- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/libmuscle/python/libmuscle/checkpoint_triggers.py b/libmuscle/python/libmuscle/checkpoint_triggers.py index 214fd872..8bf343fb 100644 --- a/libmuscle/python/libmuscle/checkpoint_triggers.py +++ b/libmuscle/python/libmuscle/checkpoint_triggers.py @@ -183,9 +183,9 @@ class TriggerManager: """Manages all checkpoint triggers and checks if a snapshot must be saved. """ - def __init__(self, reference_utctime: datetime, checkpoints: Checkpoints + def __init__(self, utc_reference: datetime, checkpoints: Checkpoints ) -> None: - self._monotonic_reference = _utc_to_monotonic(reference_utctime) + self._monotonic_reference = _utc_to_monotonic(utc_reference) self._wall = CombinedCheckpointTriggers(checkpoints.wallclocktime) self._prevwall = 0.0 @@ -280,27 +280,30 @@ def reuse_instance(self, max_f_init_next_timestamp: Optional[float] self._should_save_final_called = False self._saved_final_checkpoint = False - def update_checkpoints(self, simulationtime: float, final: bool) -> float: + def update_checkpoints(self, timestamp: float, + next_timestamp: Optional[float], final: bool + ) -> None: """Update last and next checkpoint times when a snapshot is made Args: - simulationtime: next timestamp as reported by the instance (if - available, otherwise current timestamp) - - Returns: - Current elapsed walltime + timestamp: timestamp as reported by the instance + next_timestamp: next timestamp as reported by the instance """ self._prevwall = self.elapsed_walltime() self._nextwall = self._wall.next_checkpoint(self._prevwall) if final and self._max_f_init_next_timestamp is not None: simulationtime = self._max_f_init_next_timestamp + else: + if next_timestamp is None: + simulationtime = timestamp + else: + simulationtime = next_timestamp self._prevsim = simulationtime self._nextsim = self._sim.next_checkpoint(simulationtime) self._should_have_saved = False self._saved_final_checkpoint = final - return self._prevwall def get_triggers(self) -> List[str]: """Get trigger description(s) for the current reason for checkpointing. diff --git a/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py b/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py index 6cb645c2..17afd0b1 100644 --- a/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py +++ b/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py @@ -162,7 +162,7 @@ def test_trigger_manager(): assert "wallclocktime" in triggers[0] with pytest.raises(RuntimeError): # did not call save in between trigger_manager.should_save_snapshot(t, t_next) - trigger_manager.update_checkpoints(t_next, False) + trigger_manager.update_checkpoints(t, t_next, False) t, t_next = 0.2, 0.9 assert not trigger_manager.should_save_snapshot(t, t_next) @@ -170,7 +170,7 @@ def test_trigger_manager(): t, t_next = 0.9, 3.1 assert trigger_manager.should_save_snapshot(t, t_next) assert len(trigger_manager.get_triggers()) == 1 - trigger_manager.update_checkpoints(t_next, False) + trigger_manager.update_checkpoints(t, t_next, False) t, t_next = 3.1, None assert trigger_manager.should_save_final_snapshot(t) @@ -179,7 +179,7 @@ def test_trigger_manager(): with pytest.raises(RuntimeError): # did not call save in between trigger_manager.should_save_final_snapshot(t) assert len(trigger_manager.get_triggers()) > 0 - trigger_manager.update_checkpoints(t, True) + trigger_manager.update_checkpoints(t, t_next, True) trigger_manager.reuse_instance(None) @@ -191,7 +191,7 @@ def test_trigger_manager(): assert trigger_manager.should_save_final_snapshot(t) with pytest.raises(RuntimeError): # not saved trigger_manager.reuse_instance(None) - trigger_manager.update_checkpoints(t, True) + trigger_manager.update_checkpoints(t, t_next, True) trigger_manager.reuse_instance(None) From c67e2543378dd34a042ec28a371381f4ac890f07 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Tue, 30 Aug 2022 16:55:46 +0200 Subject: [PATCH 023/183] Implement SnapshotManager --- libmuscle/python/libmuscle/mmp_client.py | 5 + libmuscle/python/libmuscle/snapshot.py | 103 +++++++++ .../python/libmuscle/snapshot_manager.py | 215 ++++++++++++++++++ 3 files changed, 323 insertions(+) create mode 100644 libmuscle/python/libmuscle/snapshot.py create mode 100644 libmuscle/python/libmuscle/snapshot_manager.py diff --git a/libmuscle/python/libmuscle/mmp_client.py b/libmuscle/python/libmuscle/mmp_client.py index fd236b52..0d3108c9 100644 --- a/libmuscle/python/libmuscle/mmp_client.py +++ b/libmuscle/python/libmuscle/mmp_client.py @@ -9,6 +9,7 @@ from libmuscle.mcp.tcp_transport_client import TcpTransportClient from libmuscle.profiling import ProfileEvent from libmuscle.logging import LogMessage +from libmuscle.snapshot import SnapshotMetadata CONNECTION_TIMEOUT = 300 @@ -93,6 +94,10 @@ def submit_profile_events(self, events: Iterable[ProfileEvent]) -> None: [encode_profile_event(e) for e in events]] self._call_manager(request) + def submit_snapshot_metadata(self, snapshot_metadata: SnapshotMetadata + ) -> None: + ... # TODO + def get_settings(self) -> Settings: """Get the central settings from the manager. diff --git a/libmuscle/python/libmuscle/snapshot.py b/libmuscle/python/libmuscle/snapshot.py new file mode 100644 index 00000000..f9e8966f --- /dev/null +++ b/libmuscle/python/libmuscle/snapshot.py @@ -0,0 +1,103 @@ +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import TYPE_CHECKING, Dict, List, Optional, cast + +import msgpack + +if TYPE_CHECKING: + # prevent circular import + from libmuscle.communicator import Message + + +class Snapshot(ABC): + """Snapshot data structure. + + This is an abstract base class, implementations are provided by subclasses. + """ + SNAPSHOT_VERSION_BYTE = b'\0' + + def __init__(self, + triggers: List[str], + wallclocktime: float, + port_message_counts: Dict[str, List[int]], + is_final_snapshot: bool, + message: 'Message') -> None: + self.triggers = triggers + self.wallclocktime = wallclocktime + self.port_message_counts = port_message_counts + self.is_final_snapshot = is_final_snapshot + self.message = message + + @classmethod + @abstractmethod + def from_bytes(cls, data: bytes) -> 'Snapshot': + """Create a snapshot object from binary data. + + Args: + data: binary data representing the snapshot. Note that this must + **exclude** the versioning byte. + """ + ... + + @abstractmethod + def to_bytes(self) -> bytes: + """Convert the snapshot object to binary data. + + Returns: + Binary data representing the snapshot. Note that this must + **exclude** the versioning byte. + """ + ... + + +class MsgPackSnapshot(Snapshot): + """Snapshot stored in messagepack format + """ + SNAPSHOT_VERSION_BYTE = b'1' + + @classmethod + def from_bytes(cls, data: bytes) -> 'Snapshot': + dct = msgpack.loads(data) + return cls(dct['triggers'], + dct['wallclocktime'], + dct['port_message_counts'], + dct['is_final_snapshot'], + dct['message']) + + def to_bytes(self) -> bytes: + return cast(bytes, msgpack.dumps({ + 'triggers': self.triggers, + 'wallclocktime': self.wallclocktime, + 'port_message_counts': self.port_message_counts, + 'is_final_snapshot': self.is_final_snapshot, + 'message': self.message + })) + + +@dataclass +class SnapshotMetadata: + """Metadata of a snapshot for sending to the muscle_manager. + """ + triggers: List[str] + wallclocktime: float + timestamp: float + next_timestamp: Optional[float] + port_message_counts: Dict[str, List[int]] + is_final_snapshot: bool + # storing as str, because Path cannot be serialized by msgpack + snapshot_filename: str + + @staticmethod + def from_snapshot(snapshot: Snapshot, snapshot_filename: str + ) -> 'SnapshotMetadata': + """Create snapshot metadata from the given snapshot and filename + """ + return SnapshotMetadata( + snapshot.triggers, + snapshot.wallclocktime, + snapshot.message.timestamp, + snapshot.message.next_timestamp, + snapshot.port_message_counts, + snapshot.is_final_snapshot, + snapshot_filename + ) diff --git a/libmuscle/python/libmuscle/snapshot_manager.py b/libmuscle/python/libmuscle/snapshot_manager.py new file mode 100644 index 00000000..afce7908 --- /dev/null +++ b/libmuscle/python/libmuscle/snapshot_manager.py @@ -0,0 +1,215 @@ +import logging +from datetime import datetime +from pathlib import Path +from typing import Optional, cast + +from ymmsl import Checkpoints, Reference + +from libmuscle.checkpoint_triggers import TriggerManager +from libmuscle.communicator import Communicator, Message +from libmuscle.mmp_client import MMPClient +from libmuscle.snapshot import MsgPackSnapshot, Snapshot, SnapshotMetadata + +_logger = logging.getLogger(__name__) + +_MAX_FILE_EXISTS_CHECK = 10000 + + +class SnapshotManager: + """Manages information on snapshots for the Instance + + Implements the public checkpointing API with handoffs to + :class:`TriggerManager` for checkpoint triggers. + """ + + def __init__(self, + instance_id: Reference, + manager: MMPClient, + communicator: Communicator) -> None: + """Create a new snapshot manager + + Args: + instance_id: The id of this instance. + manager: The client used to submit data to the manager. + communicator: The communicator belonging to this instance. + """ + self._instance_id = instance_id + # replace identifier[i] by identifier-i to use in snapshot file name + # using a dash (-) because that is not allowed in Identifiers + self._safe_id = str(instance_id).replace("[", "-").replace("]", "") + self._communicator = communicator + self._manager = manager + + self._first_reuse = True + self._resume_from_snapshot = None # type: Optional[Snapshot] + self._trigger = None # type: Optional[TriggerManager] + self._snapshot_directory = None # type: Optional[Path] + self._next_snapshot_num = 1 + + def registered(self, + utc_reference: datetime, + checkpoints: Checkpoints, + resume: Optional[Path]) -> None: + """Callback after registering with the manager. + + Provide the snapshot manager with info on workflow checkpoints and if we + should resume from a previous snapshot. + + Args: + utc_reference: datetime (in UTC timezone) indicating wallclocktime=0 + checkpoints: requested workflow checkpoints + resume: previous snapshot to resume from (or None if not resuming) + """ + if checkpoints: + self._trigger = TriggerManager(utc_reference, checkpoints) + if resume is not None: + self.__load_snapshot(resume) + snapshot = cast(Snapshot, self._resume_from_snapshot) + self._communicator.restore_message_counts( + snapshot.port_message_counts) + + def reuse_instance(self, + max_f_init_next_timestamp: Optional[float], + snapshot_directory: Path, + ) -> None: + """Callback on Instance.reuse_instance + + Args: + max_f_init_next_timestamp: maximum next_timestamp of all F_INIT + messages. May be None if no message has next_timestamp set or + if no F_INIT messages were received. + """ + if self._trigger is not None: + self._trigger.reuse_instance(max_f_init_next_timestamp) + + self._snapshot_directory = snapshot_directory + + if self._first_reuse: + self._first_reuse = False + else: + self._resume_from_snapshot = None + + def resuming(self) -> bool: + """Check if we are resuming during this reuse iteration. + """ + return self._resume_from_snapshot is not None + + def load_snapshot(self) -> Message: + """Get the Message to resume from + """ + if self._resume_from_snapshot is None: + raise RuntimeError('No snapshot to load. Use "instance.resuming()"' + ' to check if a snapshot is available') + return self._resume_from_snapshot.message + + def should_save_snapshot(self, timestamp: float, + next_timestamp: Optional[float]) -> bool: + """See :meth:`TriggerManager.should_save_snapshot` + """ + if self._trigger is None: + return False # checkpointing disabled + return self._trigger.should_save_snapshot(timestamp, next_timestamp) + + def should_save_final_snapshot(self, timestamp: float) -> bool: + """See :meth:`TriggerManager.should_save_final_snapshot` + """ + if self._trigger is None: + return False # checkpointing disabled + return self._trigger.should_save_final_snapshot(timestamp) + + def save_snapshot(self, msg: Message) -> None: + """Save snapshot contained in the message object. + """ + self.__save_snapshot(msg, False) + + def save_final_snapshot(self, msg: Message) -> None: + """Save final snapshot contained in the message object + """ + self.__save_snapshot(msg, True) + + def __save_snapshot(self, msg: Message, final: bool) -> None: + """Actual implementation used by save_(final_)snapshot. + + Args: + msg: message object representing the snapshot + final: True iff called from save_final_snapshot + """ + if self._trigger is None: + _logger.warning('Saving a snapshot but no checkpoints requested' + ' by the workflow.') + triggers = [] + wallclocktime = 0.0 + else: + triggers = self._trigger.get_triggers() + wallclocktime = self._trigger.elapsed_walltime() + + port_message_counts = self._communicator.get_message_counts() + snapshot = MsgPackSnapshot( + triggers, wallclocktime, port_message_counts, final, msg) + + path = self.__store_snapshot(snapshot) + metadata = SnapshotMetadata.from_snapshot(snapshot, str(path)) + self._manager.submit_snapshot_metadata(metadata) + + if self._trigger is not None: + self._trigger.update_checkpoints( + msg.timestamp, msg.next_timestamp, final) + + def __load_snapshot(self, snapshot_location: Path) -> None: + """Load a previously stored snapshot from the filesystem + + Args: + snapshot_location: path where the snapshot is stored + """ + if not snapshot_location.is_file(): + raise RuntimeError(f'Unable to load snapshot: {snapshot_location}' + ' is not a file. Please ensure this path exists' + ' and can be read.') + + # TODO: encapsulate I/O errors? + with snapshot_location.open("rb") as snapshot_file: + version = snapshot_file.read(1) + data = snapshot_file.read() + + if version == MsgPackSnapshot.SNAPSHOT_VERSION_BYTE: + self._resume_from_snapshot = MsgPackSnapshot.from_bytes(data) + else: + raise RuntimeError('Unable to load snapshot from' + f' {snapshot_location}: unknown version of' + ' snapshot file. Was the file saved with a' + ' different version of libmuscle or' + ' tampered with?') + + def __store_snapshot(self, snapshot: Snapshot) -> Path: + """Store a snapshot on the filesystem + + Args: + snapshot: snapshot to store + + Returns: + Path where the snapshot is stored + """ + if self._snapshot_directory is None: + raise RuntimeError('Unknown snapshot directory. Did you try to' + ' save a snapshot before entering the reuse' + ' loop?') + for _ in range(_MAX_FILE_EXISTS_CHECK): + # Expectation is that muscle_snapshot_directory is empty initially + # and we succeed in the first loop. Still wrapping in a for-loop + # such that an existing filename doesn't immediately raise an error + fname = f"{self._safe_id}_{self._next_snapshot_num}.pack" + fpath = self._snapshot_directory / fname + self._next_snapshot_num += 1 + if not fpath.exists(): + break + else: + raise RuntimeError('Could not find an available filename for' + f' storing the next snapshot: {fpath} already' + ' exists.') + # Opening with mode 'x' since a file with the same name may be created + # in the small window between checking above and opening here. It is + # better to fail with an error than to overwrite an existing file. + with fpath.open('xb') as snapshot_file: + snapshot_file.write(snapshot.SNAPSHOT_VERSION_BYTE) + snapshot_file.write(snapshot.to_bytes()) + return fpath From f7bd7e11b69e7b6daade72374be442a8660f412a Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Tue, 30 Aug 2022 17:28:47 +0200 Subject: [PATCH 024/183] Fix flake8 issue --- libmuscle/python/libmuscle/mmp_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libmuscle/python/libmuscle/mmp_client.py b/libmuscle/python/libmuscle/mmp_client.py index 0d3108c9..6771b1dd 100644 --- a/libmuscle/python/libmuscle/mmp_client.py +++ b/libmuscle/python/libmuscle/mmp_client.py @@ -96,7 +96,7 @@ def submit_profile_events(self, events: Iterable[ProfileEvent]) -> None: def submit_snapshot_metadata(self, snapshot_metadata: SnapshotMetadata ) -> None: - ... # TODO + ... # TODO def get_settings(self) -> Settings: """Get the central settings from the manager. From 9a7db1b9c63c6c43052b889d2b22a7e23eb7ac50 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Tue, 30 Aug 2022 17:29:24 +0200 Subject: [PATCH 025/183] Add snapshot tests --- libmuscle/python/libmuscle/snapshot.py | 31 +++++++++--- .../python/libmuscle/test/test_snapshot.py | 50 +++++++++++++++++++ 2 files changed, 74 insertions(+), 7 deletions(-) create mode 100644 libmuscle/python/libmuscle/test/test_snapshot.py diff --git a/libmuscle/python/libmuscle/snapshot.py b/libmuscle/python/libmuscle/snapshot.py index f9e8966f..560e1129 100644 --- a/libmuscle/python/libmuscle/snapshot.py +++ b/libmuscle/python/libmuscle/snapshot.py @@ -1,12 +1,12 @@ from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import TYPE_CHECKING, Dict, List, Optional, cast +from typing import Dict, List, Optional, cast import msgpack +from ymmsl import Reference, Settings -if TYPE_CHECKING: - # prevent circular import - from libmuscle.communicator import Message +from libmuscle.mpp_message import MPPMessage +from libmuscle import communicator class Snapshot(ABC): @@ -21,7 +21,7 @@ def __init__(self, wallclocktime: float, port_message_counts: Dict[str, List[int]], is_final_snapshot: bool, - message: 'Message') -> None: + message: 'communicator.Message') -> None: self.triggers = triggers self.wallclocktime = wallclocktime self.port_message_counts = port_message_counts @@ -62,7 +62,7 @@ def from_bytes(cls, data: bytes) -> 'Snapshot': dct['wallclocktime'], dct['port_message_counts'], dct['is_final_snapshot'], - dct['message']) + cls.bytes_to_message(dct['message'])) def to_bytes(self) -> bytes: return cast(bytes, msgpack.dumps({ @@ -70,9 +70,26 @@ def to_bytes(self) -> bytes: 'wallclocktime': self.wallclocktime, 'port_message_counts': self.port_message_counts, 'is_final_snapshot': self.is_final_snapshot, - 'message': self.message + 'message': self.message_to_bytes(self.message) })) + @staticmethod + def message_to_bytes(message: 'communicator.Message') -> bytes: + """Use MPPMessage serializer for serializing the message object + """ + return MPPMessage(Reference('_'), Reference('_'), None, + message.timestamp, message.next_timestamp, + Settings(), 0, message.data).encoded() + + @staticmethod + def bytes_to_message(data: bytes) -> 'communicator.Message': + """Use MPPMessage deserializer for serializing the message object + """ + mpp_message = MPPMessage.from_bytes(data) + return communicator.Message(mpp_message.timestamp, + mpp_message.next_timestamp, + mpp_message.data) + @dataclass class SnapshotMetadata: diff --git a/libmuscle/python/libmuscle/test/test_snapshot.py b/libmuscle/python/libmuscle/test/test_snapshot.py new file mode 100644 index 00000000..b238df44 --- /dev/null +++ b/libmuscle/python/libmuscle/test/test_snapshot.py @@ -0,0 +1,50 @@ +import pytest + +from libmuscle.communicator import Message +from libmuscle.snapshot import Snapshot, MsgPackSnapshot, SnapshotMetadata + + +@pytest.fixture +def snapshot() -> Snapshot: + triggers = ["test triggers"] + wallclocktime = 15.3 + port_message_counts = {'in': [1], 'out': [4], 'muscle_settings_in': [0]} + is_final = True + message = Message(1.2, None, "test_data") + snapshot = MsgPackSnapshot( + triggers, wallclocktime, port_message_counts, is_final, message) + assert snapshot.triggers == triggers + assert snapshot.wallclocktime == wallclocktime + assert snapshot.port_message_counts == port_message_counts + assert snapshot.is_final_snapshot == is_final + assert snapshot.message == message + return snapshot + + +def test_snapshot(snapshot: Snapshot) -> None: + assert isinstance(snapshot, Snapshot) + + binary_snapshot = snapshot.to_bytes() + assert isinstance(binary_snapshot, bytes) + + snapshot2 = MsgPackSnapshot.from_bytes(binary_snapshot) + + assert snapshot2.triggers == snapshot.triggers + assert snapshot2.wallclocktime == snapshot.wallclocktime + assert snapshot2.port_message_counts == snapshot.port_message_counts + assert snapshot2.is_final_snapshot == snapshot.is_final_snapshot + assert snapshot2.message.timestamp == snapshot.message.timestamp + assert snapshot2.message.next_timestamp == snapshot.message.next_timestamp + assert snapshot2.message.data == snapshot.message.data + + +def test_snapshot_metadata(snapshot: Snapshot) -> None: + metadata = SnapshotMetadata.from_snapshot(snapshot, "test") + + assert metadata.triggers == snapshot.triggers + assert metadata.wallclocktime == snapshot.wallclocktime + assert metadata.port_message_counts == snapshot.port_message_counts + assert metadata.is_final_snapshot == snapshot.is_final_snapshot + assert metadata.timestamp == snapshot.message.timestamp + assert metadata.next_timestamp == snapshot.message.next_timestamp + assert metadata.snapshot_filename == "test" From 773d1284f67eb512be0ab3b902806fbd0d9aa89c Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Wed, 31 Aug 2022 10:45:48 +0200 Subject: [PATCH 026/183] Add tests for SnapshotManager & fix bugs --- .../python/libmuscle/checkpoint_triggers.py | 3 + .../python/libmuscle/snapshot_manager.py | 11 +- .../libmuscle/test/test_snapshot_manager.py | 100 ++++++++++++++++++ 3 files changed, 111 insertions(+), 3 deletions(-) create mode 100644 libmuscle/python/libmuscle/test/test_snapshot_manager.py diff --git a/libmuscle/python/libmuscle/checkpoint_triggers.py b/libmuscle/python/libmuscle/checkpoint_triggers.py index 8bf343fb..b3715525 100644 --- a/libmuscle/python/libmuscle/checkpoint_triggers.py +++ b/libmuscle/python/libmuscle/checkpoint_triggers.py @@ -302,6 +302,9 @@ def update_checkpoints(self, timestamp: float, self._prevsim = simulationtime self._nextsim = self._sim.next_checkpoint(simulationtime) + # this method is also called during resume, after which we no longer + # consider the simulationtime as reset + self._sim_reset = False self._should_have_saved = False self._saved_final_checkpoint = final diff --git a/libmuscle/python/libmuscle/snapshot_manager.py b/libmuscle/python/libmuscle/snapshot_manager.py index afce7908..e17cb460 100644 --- a/libmuscle/python/libmuscle/snapshot_manager.py +++ b/libmuscle/python/libmuscle/snapshot_manager.py @@ -67,6 +67,11 @@ def registered(self, snapshot = cast(Snapshot, self._resume_from_snapshot) self._communicator.restore_message_counts( snapshot.port_message_counts) + if self._trigger: + self._trigger.update_checkpoints( + snapshot.message.timestamp, + snapshot.message.next_timestamp, + snapshot.is_final_snapshot) def reuse_instance(self, max_f_init_next_timestamp: Optional[float], @@ -135,8 +140,8 @@ def __save_snapshot(self, msg: Message, final: bool) -> None: final: True iff called from save_final_snapshot """ if self._trigger is None: - _logger.warning('Saving a snapshot but no checkpoints requested' - ' by the workflow.') + _logger.info('Saving a snapshot but no checkpoints requested' + ' by the workflow.') triggers = [] wallclocktime = 0.0 else: @@ -167,7 +172,7 @@ def __load_snapshot(self, snapshot_location: Path) -> None: ' and can be read.') # TODO: encapsulate I/O errors? - with snapshot_location.open("rb") as snapshot_file: + with snapshot_location.open('rb') as snapshot_file: version = snapshot_file.read(1) data = snapshot_file.read() diff --git a/libmuscle/python/libmuscle/test/test_snapshot_manager.py b/libmuscle/python/libmuscle/test/test_snapshot_manager.py new file mode 100644 index 00000000..b5a8edde --- /dev/null +++ b/libmuscle/python/libmuscle/test/test_snapshot_manager.py @@ -0,0 +1,100 @@ +from datetime import datetime, timezone +import logging +from pathlib import Path +from unittest.mock import MagicMock + +import pytest +from ymmsl import Reference, Checkpoints, CheckpointRules + +from libmuscle.communicator import Message +from libmuscle.snapshot import SnapshotMetadata +from libmuscle.snapshot_manager import SnapshotManager + + +def test_no_checkpointing(caplog: pytest.LogCaptureFixture, tmp_path: Path + ) -> None: + manager = MagicMock() + communicator = MagicMock() + communicator.get_message_counts.return_value = {} + snapshot_manager = SnapshotManager(Reference('test'), manager, communicator) + + snapshot_manager.registered(datetime.now(timezone.utc), Checkpoints(), None) + + snapshot_manager.reuse_instance(None, Path(tmp_path)) + assert not snapshot_manager.resuming() + assert not snapshot_manager.should_save_snapshot(1, None) + assert not snapshot_manager.should_save_snapshot(5000, None) + assert not snapshot_manager.should_save_final_snapshot(1000) + + with caplog.at_level(logging.INFO, 'libmuscle.snapshot_manager'): + snapshot_manager.save_snapshot(Message(1.0, None, None)) + assert caplog.records[0].levelname == "INFO" + assert "no checkpoints" in caplog.records[0].message + + +def test_save_load_checkpoint(tmp_path: Path) -> None: + manager = MagicMock() + communicator = MagicMock() + port_message_counts = {'in': [1], 'out': [2], 'muscle_settings_in': [0]} + communicator.get_message_counts.return_value = port_message_counts + + instance_id = Reference('test[1]') + snapshot_manager = SnapshotManager(instance_id, manager, communicator) + + checkpoints = Checkpoints(simulationtime=CheckpointRules(every=1)) + snapshot_manager.registered(datetime.now(timezone.utc), checkpoints, None) + + snapshot_manager.reuse_instance(None, tmp_path) + with pytest.raises(RuntimeError): + snapshot_manager.load_snapshot() + + assert not snapshot_manager.resuming() + assert snapshot_manager.should_save_snapshot(0.2, 0.4) + snapshot_manager.save_snapshot(Message(0.2, 0.4, 'test data')) + + communicator.get_message_counts.assert_called_with() + manager.submit_snapshot_metadata.assert_called() + metadata = manager.submit_snapshot_metadata.call_args.args[0] + assert isinstance(metadata, SnapshotMetadata) + assert metadata.triggers + assert metadata.wallclocktime > 0.0 + assert metadata.timestamp == 0.2 + assert metadata.next_timestamp == 0.4 + assert metadata.port_message_counts == port_message_counts + assert not metadata.is_final_snapshot + fpath = Path(metadata.snapshot_filename) + assert fpath.parent == tmp_path + assert fpath.name == 'test-1_1.pack' + + snapshot_manager2 = SnapshotManager(instance_id, manager, communicator) + + snapshot_manager2.registered(datetime.now(timezone.utc), checkpoints, fpath) + communicator.restore_message_counts.assert_called_with(port_message_counts) + + assert snapshot_manager2.resuming() + snapshot_manager2.reuse_instance(None, tmp_path) + assert snapshot_manager2.resuming() + msg = snapshot_manager2.load_snapshot() + assert msg.timestamp == 0.2 + assert msg.next_timestamp == 0.4 + assert msg.data == 'test data' + + assert not snapshot_manager2.should_save_snapshot(0.4, 0.6) + assert snapshot_manager2.should_save_final_snapshot(0.6) + snapshot_manager2.save_final_snapshot(Message(0.6, None, 'test data2')) + + metadata = manager.submit_snapshot_metadata.call_args.args[0] + assert isinstance(metadata, SnapshotMetadata) + assert metadata.triggers + assert metadata.wallclocktime > 0.0 + assert metadata.timestamp == 0.6 + assert metadata.next_timestamp is None + assert metadata.port_message_counts == port_message_counts + assert metadata.is_final_snapshot + fpath = Path(metadata.snapshot_filename) + assert fpath.parent == tmp_path + assert fpath.name == 'test-1_2.pack' + + assert snapshot_manager2.resuming() + snapshot_manager2.reuse_instance(None, tmp_path) + assert not snapshot_manager2.resuming() From cbb6b4f4378fd718cd72e63678ca260f44a4bd24 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Wed, 31 Aug 2022 10:46:23 +0200 Subject: [PATCH 027/183] Allow saving snapshot messages with Settings --- libmuscle/python/libmuscle/snapshot.py | 8 +++++-- .../python/libmuscle/test/test_snapshot.py | 21 +++++++++++++++---- 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/libmuscle/python/libmuscle/snapshot.py b/libmuscle/python/libmuscle/snapshot.py index 560e1129..324ab76f 100644 --- a/libmuscle/python/libmuscle/snapshot.py +++ b/libmuscle/python/libmuscle/snapshot.py @@ -77,9 +77,12 @@ def to_bytes(self) -> bytes: def message_to_bytes(message: 'communicator.Message') -> bytes: """Use MPPMessage serializer for serializing the message object """ + settings = Settings() + if message.settings is not None: + settings = message.settings return MPPMessage(Reference('_'), Reference('_'), None, message.timestamp, message.next_timestamp, - Settings(), 0, message.data).encoded() + settings, 0, message.data).encoded() @staticmethod def bytes_to_message(data: bytes) -> 'communicator.Message': @@ -88,7 +91,8 @@ def bytes_to_message(data: bytes) -> 'communicator.Message': mpp_message = MPPMessage.from_bytes(data) return communicator.Message(mpp_message.timestamp, mpp_message.next_timestamp, - mpp_message.data) + mpp_message.data, + mpp_message.settings_overlay) @dataclass diff --git a/libmuscle/python/libmuscle/test/test_snapshot.py b/libmuscle/python/libmuscle/test/test_snapshot.py index b238df44..82c0d6a5 100644 --- a/libmuscle/python/libmuscle/test/test_snapshot.py +++ b/libmuscle/python/libmuscle/test/test_snapshot.py @@ -1,4 +1,5 @@ import pytest +from ymmsl import Settings from libmuscle.communicator import Message from libmuscle.snapshot import Snapshot, MsgPackSnapshot, SnapshotMetadata @@ -6,11 +7,11 @@ @pytest.fixture def snapshot() -> Snapshot: - triggers = ["test triggers"] + triggers = ['test triggers'] wallclocktime = 15.3 port_message_counts = {'in': [1], 'out': [4], 'muscle_settings_in': [0]} is_final = True - message = Message(1.2, None, "test_data") + message = Message(1.2, None, 'test_data') snapshot = MsgPackSnapshot( triggers, wallclocktime, port_message_counts, is_final, message) assert snapshot.triggers == triggers @@ -39,7 +40,7 @@ def test_snapshot(snapshot: Snapshot) -> None: def test_snapshot_metadata(snapshot: Snapshot) -> None: - metadata = SnapshotMetadata.from_snapshot(snapshot, "test") + metadata = SnapshotMetadata.from_snapshot(snapshot, 'test') assert metadata.triggers == snapshot.triggers assert metadata.wallclocktime == snapshot.wallclocktime @@ -47,4 +48,16 @@ def test_snapshot_metadata(snapshot: Snapshot) -> None: assert metadata.is_final_snapshot == snapshot.is_final_snapshot assert metadata.timestamp == snapshot.message.timestamp assert metadata.next_timestamp == snapshot.message.next_timestamp - assert metadata.snapshot_filename == "test" + assert metadata.snapshot_filename == 'test' + + +def test_message_with_settings() -> None: + message = Message(1.0, 2.0, 'test_data', Settings({'setting': True})) + snapshot = MsgPackSnapshot([], 0, {}, False, message) + assert snapshot.message.settings.get('setting') is True + + binary_snapshot = snapshot.to_bytes() + assert isinstance(binary_snapshot, bytes) + + snapshot2 = MsgPackSnapshot.from_bytes(binary_snapshot) + assert snapshot2.message.settings.get('setting') is True From 1beb84798cbd3e5f708f38e66b0c063c69cd8977 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Wed, 31 Aug 2022 11:01:14 +0200 Subject: [PATCH 028/183] Rename wallclocktime/simulationtime to *_time See related commit in ymmsl-python: https://github.com/multiscale/ymmsl-python/commit/2b0401969a8b7c8ae807f388aee2320c2c8b57b4 --- .../python/libmuscle/checkpoint_triggers.py | 32 +++++++++---------- libmuscle/python/libmuscle/snapshot.py | 12 +++---- .../python/libmuscle/snapshot_manager.py | 8 ++--- .../test/test_checkpoint_triggers.py | 8 ++--- .../python/libmuscle/test/test_snapshot.py | 10 +++--- .../libmuscle/test/test_snapshot_manager.py | 6 ++-- 6 files changed, 38 insertions(+), 38 deletions(-) diff --git a/libmuscle/python/libmuscle/checkpoint_triggers.py b/libmuscle/python/libmuscle/checkpoint_triggers.py index b3715525..dbb8fcb4 100644 --- a/libmuscle/python/libmuscle/checkpoint_triggers.py +++ b/libmuscle/python/libmuscle/checkpoint_triggers.py @@ -187,11 +187,11 @@ def __init__(self, utc_reference: datetime, checkpoints: Checkpoints ) -> None: self._monotonic_reference = _utc_to_monotonic(utc_reference) - self._wall = CombinedCheckpointTriggers(checkpoints.wallclocktime) + self._wall = CombinedCheckpointTriggers(checkpoints.wallclock_time) self._prevwall = 0.0 self._nextwall = self._wall.next_checkpoint(0.0) # type: Optional[float] - self._sim = CombinedCheckpointTriggers(checkpoints.simulationtime) + self._sim = CombinedCheckpointTriggers(checkpoints.simulation_time) self._prevsim = None # type: Optional[float] self._nextsim = None # type: Optional[float] self._sim_reset = True @@ -206,7 +206,7 @@ def __init__(self, utc_reference: datetime, checkpoints: Checkpoints self._saved_final_checkpoint = False def elapsed_walltime(self) -> float: - """Returns elapsed wallclocktime in seconds. + """Returns elapsed wallclock_time in seconds. """ return time.monotonic() - self._monotonic_reference @@ -293,17 +293,17 @@ def update_checkpoints(self, timestamp: float, self._nextwall = self._wall.next_checkpoint(self._prevwall) if final and self._max_f_init_next_timestamp is not None: - simulationtime = self._max_f_init_next_timestamp + simulation_time = self._max_f_init_next_timestamp else: if next_timestamp is None: - simulationtime = timestamp + simulation_time = timestamp else: - simulationtime = next_timestamp - self._prevsim = simulationtime - self._nextsim = self._sim.next_checkpoint(simulationtime) + simulation_time = next_timestamp + self._prevsim = simulation_time + self._nextsim = self._sim.next_checkpoint(simulation_time) # this method is also called during resume, after which we no longer - # consider the simulationtime as reset + # consider the simulation_time as reset self._sim_reset = False self._should_have_saved = False self._saved_final_checkpoint = final @@ -315,29 +315,29 @@ def get_triggers(self) -> List[str]: self._last_triggers = [] return triggers - def __should_save(self, walltime: float, simulationtime: float) -> bool: + def __should_save(self, walltime: float, simulation_time: float) -> bool: """Check if a checkpoint should be taken Args: walltime: current wallclock time (elapsed since reference) - simulationtime: current/next timestamp as reported by the instance + simulation_time: current/next timestamp as reported by the instance """ if self._sim_reset: # we cannot make assumptions about the start time of a simulation, # a t=-1000 could make sense if t represents years since CE # and we should not disallow checkpointing for negative t - previous = self._sim.previous_checkpoint(simulationtime) + previous = self._sim.previous_checkpoint(simulation_time) if previous is not None: # there is a checkpoint rule before the current moment, assume # we should have taken a snapshot back then self._nextsim = previous else: - self._nextsim = self._sim.next_checkpoint(simulationtime) + self._nextsim = self._sim.next_checkpoint(simulation_time) self._sim_reset = False self._last_triggers = [] if self._nextwall is not None and walltime >= self._nextwall: - self._last_triggers.append(f"wallclocktime >= {self._nextwall}") - if self._nextsim is not None and simulationtime >= self._nextsim: - self._last_triggers.append(f"simulationtime >= {self._nextsim}") + self._last_triggers.append(f"wallclock_time >= {self._nextwall}") + if self._nextsim is not None and simulation_time >= self._nextsim: + self._last_triggers.append(f"simulation_time >= {self._nextsim}") return bool(self._last_triggers) diff --git a/libmuscle/python/libmuscle/snapshot.py b/libmuscle/python/libmuscle/snapshot.py index 324ab76f..93ed9307 100644 --- a/libmuscle/python/libmuscle/snapshot.py +++ b/libmuscle/python/libmuscle/snapshot.py @@ -18,12 +18,12 @@ class Snapshot(ABC): def __init__(self, triggers: List[str], - wallclocktime: float, + wallclock_time: float, port_message_counts: Dict[str, List[int]], is_final_snapshot: bool, message: 'communicator.Message') -> None: self.triggers = triggers - self.wallclocktime = wallclocktime + self.wallclock_time = wallclock_time self.port_message_counts = port_message_counts self.is_final_snapshot = is_final_snapshot self.message = message @@ -59,7 +59,7 @@ class MsgPackSnapshot(Snapshot): def from_bytes(cls, data: bytes) -> 'Snapshot': dct = msgpack.loads(data) return cls(dct['triggers'], - dct['wallclocktime'], + dct['wallclock_time'], dct['port_message_counts'], dct['is_final_snapshot'], cls.bytes_to_message(dct['message'])) @@ -67,7 +67,7 @@ def from_bytes(cls, data: bytes) -> 'Snapshot': def to_bytes(self) -> bytes: return cast(bytes, msgpack.dumps({ 'triggers': self.triggers, - 'wallclocktime': self.wallclocktime, + 'wallclock_time': self.wallclock_time, 'port_message_counts': self.port_message_counts, 'is_final_snapshot': self.is_final_snapshot, 'message': self.message_to_bytes(self.message) @@ -100,7 +100,7 @@ class SnapshotMetadata: """Metadata of a snapshot for sending to the muscle_manager. """ triggers: List[str] - wallclocktime: float + wallclock_time: float timestamp: float next_timestamp: Optional[float] port_message_counts: Dict[str, List[int]] @@ -115,7 +115,7 @@ def from_snapshot(snapshot: Snapshot, snapshot_filename: str """ return SnapshotMetadata( snapshot.triggers, - snapshot.wallclocktime, + snapshot.wallclock_time, snapshot.message.timestamp, snapshot.message.next_timestamp, snapshot.port_message_counts, diff --git a/libmuscle/python/libmuscle/snapshot_manager.py b/libmuscle/python/libmuscle/snapshot_manager.py index e17cb460..f8477637 100644 --- a/libmuscle/python/libmuscle/snapshot_manager.py +++ b/libmuscle/python/libmuscle/snapshot_manager.py @@ -56,7 +56,7 @@ def registered(self, should resume from a previous snapshot. Args: - utc_reference: datetime (in UTC timezone) indicating wallclocktime=0 + utc_reference: datetime (in UTC) indicating wallclock_time=0 checkpoints: requested workflow checkpoints resume: previous snapshot to resume from (or None if not resuming) """ @@ -143,14 +143,14 @@ def __save_snapshot(self, msg: Message, final: bool) -> None: _logger.info('Saving a snapshot but no checkpoints requested' ' by the workflow.') triggers = [] - wallclocktime = 0.0 + wallclock_time = 0.0 else: triggers = self._trigger.get_triggers() - wallclocktime = self._trigger.elapsed_walltime() + wallclock_time = self._trigger.elapsed_walltime() port_message_counts = self._communicator.get_message_counts() snapshot = MsgPackSnapshot( - triggers, wallclocktime, port_message_counts, final, msg) + triggers, wallclock_time, port_message_counts, final, msg) path = self.__store_snapshot(snapshot) metadata = SnapshotMetadata.from_snapshot(snapshot, str(path)) diff --git a/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py b/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py index 17afd0b1..41577ad3 100644 --- a/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py +++ b/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py @@ -150,8 +150,8 @@ def test_trigger_manager_reference_time(): def test_trigger_manager(): reference = datetime.now(timezone.utc) trigger_manager = TriggerManager(reference, Checkpoints( - wallclocktime=CheckpointRules(at=[1e-12]), - simulationtime=CheckpointRules(at=[1, 3, 5]))) + wallclock_time=CheckpointRules(at=[1e-12]), + simulation_time=CheckpointRules(at=[1, 3, 5]))) trigger_manager.reuse_instance(7) @@ -159,7 +159,7 @@ def test_trigger_manager(): assert trigger_manager.should_save_snapshot(t, t_next) triggers = trigger_manager.get_triggers() assert len(triggers) == 1 - assert "wallclocktime" in triggers[0] + assert "wallclock_time" in triggers[0] with pytest.raises(RuntimeError): # did not call save in between trigger_manager.should_save_snapshot(t, t_next) trigger_manager.update_checkpoints(t, t_next, False) @@ -202,7 +202,7 @@ def test_trigger_manager_warnings(caplog: pytest.LogCaptureFixture, reference = datetime.now(timezone.utc) trigger_manager = TriggerManager(reference, Checkpoints( - simulationtime=CheckpointRules(at=[1, 3, 5]))) + simulation_time=CheckpointRules(at=[1, 3, 5]))) trigger_manager.reuse_instance(2) diff --git a/libmuscle/python/libmuscle/test/test_snapshot.py b/libmuscle/python/libmuscle/test/test_snapshot.py index 82c0d6a5..c959a226 100644 --- a/libmuscle/python/libmuscle/test/test_snapshot.py +++ b/libmuscle/python/libmuscle/test/test_snapshot.py @@ -8,14 +8,14 @@ @pytest.fixture def snapshot() -> Snapshot: triggers = ['test triggers'] - wallclocktime = 15.3 + wallclock_time = 15.3 port_message_counts = {'in': [1], 'out': [4], 'muscle_settings_in': [0]} is_final = True message = Message(1.2, None, 'test_data') snapshot = MsgPackSnapshot( - triggers, wallclocktime, port_message_counts, is_final, message) + triggers, wallclock_time, port_message_counts, is_final, message) assert snapshot.triggers == triggers - assert snapshot.wallclocktime == wallclocktime + assert snapshot.wallclock_time == wallclock_time assert snapshot.port_message_counts == port_message_counts assert snapshot.is_final_snapshot == is_final assert snapshot.message == message @@ -31,7 +31,7 @@ def test_snapshot(snapshot: Snapshot) -> None: snapshot2 = MsgPackSnapshot.from_bytes(binary_snapshot) assert snapshot2.triggers == snapshot.triggers - assert snapshot2.wallclocktime == snapshot.wallclocktime + assert snapshot2.wallclock_time == snapshot.wallclock_time assert snapshot2.port_message_counts == snapshot.port_message_counts assert snapshot2.is_final_snapshot == snapshot.is_final_snapshot assert snapshot2.message.timestamp == snapshot.message.timestamp @@ -43,7 +43,7 @@ def test_snapshot_metadata(snapshot: Snapshot) -> None: metadata = SnapshotMetadata.from_snapshot(snapshot, 'test') assert metadata.triggers == snapshot.triggers - assert metadata.wallclocktime == snapshot.wallclocktime + assert metadata.wallclock_time == snapshot.wallclock_time assert metadata.port_message_counts == snapshot.port_message_counts assert metadata.is_final_snapshot == snapshot.is_final_snapshot assert metadata.timestamp == snapshot.message.timestamp diff --git a/libmuscle/python/libmuscle/test/test_snapshot_manager.py b/libmuscle/python/libmuscle/test/test_snapshot_manager.py index b5a8edde..7ac0972f 100644 --- a/libmuscle/python/libmuscle/test/test_snapshot_manager.py +++ b/libmuscle/python/libmuscle/test/test_snapshot_manager.py @@ -41,7 +41,7 @@ def test_save_load_checkpoint(tmp_path: Path) -> None: instance_id = Reference('test[1]') snapshot_manager = SnapshotManager(instance_id, manager, communicator) - checkpoints = Checkpoints(simulationtime=CheckpointRules(every=1)) + checkpoints = Checkpoints(simulation_time=CheckpointRules(every=1)) snapshot_manager.registered(datetime.now(timezone.utc), checkpoints, None) snapshot_manager.reuse_instance(None, tmp_path) @@ -57,7 +57,7 @@ def test_save_load_checkpoint(tmp_path: Path) -> None: metadata = manager.submit_snapshot_metadata.call_args.args[0] assert isinstance(metadata, SnapshotMetadata) assert metadata.triggers - assert metadata.wallclocktime > 0.0 + assert metadata.wallclock_time > 0.0 assert metadata.timestamp == 0.2 assert metadata.next_timestamp == 0.4 assert metadata.port_message_counts == port_message_counts @@ -86,7 +86,7 @@ def test_save_load_checkpoint(tmp_path: Path) -> None: metadata = manager.submit_snapshot_metadata.call_args.args[0] assert isinstance(metadata, SnapshotMetadata) assert metadata.triggers - assert metadata.wallclocktime > 0.0 + assert metadata.wallclock_time > 0.0 assert metadata.timestamp == 0.6 assert metadata.next_timestamp is None assert metadata.port_message_counts == port_message_counts From 07f3b86bfb0c65d4f014d0c6ccd02562d31af044 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Wed, 31 Aug 2022 11:08:08 +0200 Subject: [PATCH 029/183] Add dataclasses backport as dependency for py3.6 --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index ade2fa38..b131fb1e 100644 --- a/setup.py +++ b/setup.py @@ -55,6 +55,7 @@ "numpy>=1.22,<=1.25; python_version>='3.8'", 'qcg-pilotjob==0.13.1', 'typing_extensions<4', + "dataclasses; python_version=='3.6'", 'ymmsl>=0.12.0,<0.13' # Also in CI, update there as well ], extras_require={ From b6b0ef52e6fb69d15b7f17345182497f6a751238 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Wed, 31 Aug 2022 11:13:40 +0200 Subject: [PATCH 030/183] types-dataclasses dependency in tox.ini for py3.6 --- tox.ini | 1 + 1 file changed, 1 insertion(+) diff --git a/tox.ini b/tox.ini index 23fb19f3..717d5107 100644 --- a/tox.ini +++ b/tox.ini @@ -9,6 +9,7 @@ deps = pytest pytest-cov git+https://github.com/multiscale/ymmsl-python.git@feature/checkpointing#egg=ymmsl + types-dataclasses; python_version=='3.6' passenv = MUSCLE_TEST_PYTHON_ONLY From 65036e4ca8df199549e51882dcb5dcb35ef9ad6a Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Wed, 31 Aug 2022 11:22:16 +0200 Subject: [PATCH 031/183] Rewrite MagicMock.call_args.args (py3.8+ only) --- libmuscle/python/libmuscle/test/test_snapshot_manager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libmuscle/python/libmuscle/test/test_snapshot_manager.py b/libmuscle/python/libmuscle/test/test_snapshot_manager.py index 7ac0972f..a8223e53 100644 --- a/libmuscle/python/libmuscle/test/test_snapshot_manager.py +++ b/libmuscle/python/libmuscle/test/test_snapshot_manager.py @@ -54,7 +54,7 @@ def test_save_load_checkpoint(tmp_path: Path) -> None: communicator.get_message_counts.assert_called_with() manager.submit_snapshot_metadata.assert_called() - metadata = manager.submit_snapshot_metadata.call_args.args[0] + metadata = manager.submit_snapshot_metadata.call_args[0][0] assert isinstance(metadata, SnapshotMetadata) assert metadata.triggers assert metadata.wallclock_time > 0.0 @@ -83,7 +83,7 @@ def test_save_load_checkpoint(tmp_path: Path) -> None: assert snapshot_manager2.should_save_final_snapshot(0.6) snapshot_manager2.save_final_snapshot(Message(0.6, None, 'test data2')) - metadata = manager.submit_snapshot_metadata.call_args.args[0] + metadata = manager.submit_snapshot_metadata.call_args[0][0] assert isinstance(metadata, SnapshotMetadata) assert metadata.triggers assert metadata.wallclock_time > 0.0 From 98d11311fa7b8ca332c4b114c9102f0675073e02 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Wed, 31 Aug 2022 17:24:03 +0200 Subject: [PATCH 032/183] Add checkpointing API and to Instance --- libmuscle/python/libmuscle/instance.py | 192 +++++++++++++++++- libmuscle/python/libmuscle/mmp_client.py | 11 +- .../python/libmuscle/snapshot_manager.py | 10 +- .../python/libmuscle/test/test_instance.py | 9 +- .../libmuscle/test/test_snapshot_manager.py | 9 +- 5 files changed, 214 insertions(+), 17 deletions(-) diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py index 513018d6..f91bae55 100644 --- a/libmuscle/python/libmuscle/instance.py +++ b/libmuscle/python/libmuscle/instance.py @@ -1,13 +1,15 @@ from copy import copy +from datetime import datetime import logging import os +from pathlib import Path import sys from typing import cast, Dict, List, Optional, Tuple, overload # TODO: import from typing module when dropping support for python 3.7 from typing_extensions import Literal from ymmsl import (Identifier, Operator, SettingValue, Port, Reference, - Settings) + Settings, Checkpoints) from libmuscle.communicator import Communicator, Message from libmuscle.settings_manager import SettingsManager @@ -17,6 +19,7 @@ from libmuscle.mmp_client import MMPClient from libmuscle.profiler import Profiler from libmuscle.profiling import ProfileEventType +from libmuscle.snapshot_manager import SnapshotManager from libmuscle.util import extract_log_file_location @@ -65,13 +68,20 @@ def __init__(self, ports: Optional[Dict[Operator, List[str]]] = None self._settings_manager = SettingsManager() """Settings for this instance.""" + self._snapshot_manager = SnapshotManager( + self._instance_name(), self.__manager, self._communicator) + """Keeps track of checkpointing and snapshots""" + self._first_run = True """Keeps track of whether this is the first reuse run.""" self._f_init_cache = dict() # type: _FInitCacheType - self._register() + checkpoint_info = self._register() self._connect() + # Note: SnapshotManager.set_checkpoint_info needs to have the ports + # initialized so it comes after self._connect() + self._snapshot_manager.set_checkpoint_info(*checkpoint_info) self._set_local_log_level() self._set_remote_log_level() @@ -107,6 +117,17 @@ def reuse_instance(self, apply_overlay: bool = True) -> bool: and everything will be fine. If it turns out that you did need to specify False, MUSCLE3 will tell you about it in an error message and you can add it still. + + Raises: + RuntimeError: + When implementing the checkpointing API, but libmuscle detected + incorrect API calls. The description of the RuntimeError + indicates which calls are incorrect or missing. For more + information see the checkpointing API documentation in + :meth:`resuming`, :meth:`load_snapshot`, + :meth:`should_save_snapshot`, :meth:`save_snapshot`, + :meth:`should_save_final_snapshot` and + :meth:`save_final_snapshot`, or the checkpointing tutorial. """ do_reuse = self.__receive_settings() @@ -132,6 +153,22 @@ def reuse_instance(self, apply_overlay: bool = True) -> bool: if isinstance(message.data, ClosePort): do_reuse = False + max_f_init_next_timestamp = max( + (msg.next_timestamp + for msg in self._f_init_cache.values() + if msg.next_timestamp is not None), + default=None) + # Note: muscle_snapshot_directory setting is provided by muscle_manager + # when checkpointing is enabled for this run. When checkpointing is not + # enabled, it might not exist and a KeyError is raised. + try: + snapshot_dir = self.get_setting('muscle_snapshot_directory', 'str') + snapshot_path = Path(cast(str, snapshot_dir)) + except KeyError: + snapshot_path = None + self._snapshot_manager.reuse_instance( + max_f_init_next_timestamp, snapshot_path) + if not do_reuse: self.__close_ports() self._communicator.shutdown() @@ -381,16 +418,161 @@ def receive_with_settings( """ return self.__receive_message(port_name, slot, default, True) - def _register(self) -> None: + def resuming(self) -> bool: + """Check if this instance is resuming from a snapshot. + + Must be used by submodels that implement the checkpointing API. You'll + get a RuntimeError when not calling this method in an iteration of the + reuse loop. + + This method returns True for the first iteration of the reuse loop after + resuming from a previously taken snapshot. When resuming from a + snapshot, the submodel must load its state from the snapshot as returned + by :meth:`load_snapshot` and the F_INIT step must be skipped. + + Returns: + True iff the submodel must resume from a snapshot instead of the + usual F_INIT step during this iteration of the reuse loop. + """ + return self._snapshot_manager.resuming() + + def load_snapshot(self) -> Message: + """Load a snapshot. + + Must only be called when :meth:`resuming` returns True. + + Returns: + Message object containing the state as saved in a previous run + through :meth:`save_snapshot` or :meth:`save_final_snapshot` + + Raises: + RuntimeError: if not resuming from a snapshot. + """ + return self._snapshot_manager.load_snapshot() + + def should_save_snapshot( + self, timestamp: float, next_timestamp: Optional[float]) -> bool: + """Check if a snapshot should be saved inside a time-integration loop. + + This method checks if a snapshot should be saved right now, based on the + provided timestamps and passed wallclock time. + + When the next timestamp is provided, this value will be used to + determine if a checkpoint will be passed between now and the next time + step. A submodel should always provide the next timestamp if available, + since this is the most reliable way to get consistent snapshots across + all submodels in the run. + + When a submodel cannot provide the next timestamp, a best efford is made + to get consistent snapshots (based on the current timestamp). See the + checkpointing tutorial for more information. + + When this method returns True, the submodel must also save a snapshot + through :meth:`save_snapshot`. A RuntimeError will be generated when not + doing so. + + See also :meth:`should_save_final_snapshot` for the variant that must be + called at the end of a time-integration loop, or when a submodel does + not have a time-integration loop. + + Args: + timestamp: current timestamp of the submodel + next_timestamp: timestamp of the next iteration of the time + integration loop of the submodel or ``None`` if not available + + Returns: + True iff a snapshot should be taken by the submodel according to the + checkpoint rules provided in the ymmsl configuration. + """ + return self._snapshot_manager.should_save_snapshot( + timestamp, next_timestamp) + + def save_snapshot(self, message: Message) -> None: + """Save a snapshot inside a time-integration loop. + + Before saving a snapshot, you should check using + :meth:`should_save_snapshot` if a snapshot should be saved according to + the checkpoint rules specified in the ymmsl configuration. You should + use the same timestamp and next_timestamp in the provided Message object + as used to query `should_save_snapshot`. + + Although it is allowed to save a snapshot even when + :meth:`should_save_snapshot` returns False, you should avoid this: this + situation is not likely to lead to a consistent snapshot over all + submodels of the run (and therefore it is not useful to restart from). + It could also lead to a lot of snapshot files clogging your file system. + + See also :meth:`save_final_snapshot` for the variant that must be called + at the end of a time-integration loop, or when a submodel does not have + a time-integration loop. + + Args: + message: Message object that is saved as snapshot. The message + timestamp and next_timestamp attributes should be the same as + passed to :meth:`should_save_snapshot`. The data attribute can + be used to store the internal state of the submodel. + """ + return self._snapshot_manager.save_snapshot(message) + + def should_save_final_snapshot(self, timestamp: float) -> bool: + """Check if a snapshot should be saved before O_F. + + This method checks if a snapshot should be saved right now, based on the + provided timestamp and passed wallclock time. + + When this method returns True, the submodel must also save a snapshot + through :meth:`save_final_snapshot`. A RuntimeError will be generated + when not doing so. + + See also :meth:`should_save_snapshot` for the variant that may be called + inside of a time-integration loop of the submodel. + + Args: + timestamp: current timestamp of the submodel + + Returns: + True iff a final snapshot should be taken by the submodel according + to the checkpoint rules provided in the ymmsl configuration. + """ + return self._snapshot_manager.should_save_final_snapshot(timestamp) + + def save_final_snapshot(self, message: Message) -> None: + """Save a snapshot before O_F. + + Before saving a snapshot, you should check using + :meth:`should_save_final_snapshot` if a snapshot should be saved + according to the checkpoint rules specified in the ymmsl configuration. + You should use the same timestamp in the provided Message object as used + to query `should_save_final_snapshot`. + + Although it is allowed to save a snapshot even when + :meth:`should_save_final_snapshot` returns False, you should avoid this: + this situation is not likely to lead to a consistent snapshot over all + submodels of the run (and therefore it is not useful to restart from). + It could also lead to a lot of snapshot files clogging your file system. + + See also :meth:`save_snapshot` for the variant that may be called inside + of a time-integration loop of the submodel. + + Args: + message: Message object that is saved as snapshot. The message + timestamp should be the same as passed to + :meth:`should_save_snapshot`. The data attribute can be used to + store the internal state of the submodel. + """ + return self._snapshot_manager.save_final_snapshot(message) + + def _register(self) -> Tuple[datetime, Checkpoints, Optional[Path]]: """Register this instance with the manager. """ register_event = self._profiler.start(ProfileEventType.REGISTER) locations = self._communicator.get_locations() port_list = self.__list_declared_ports() - self.__manager.register_instance(self._instance_name(), locations, - port_list) + checkpoint_info = self.__manager.register_instance( + self._instance_name(), locations, port_list) register_event.stop() _logger.info('Registered with the manager') + return checkpoint_info def _connect(self) -> None: """Connect this instance to the given peers / conduits. diff --git a/libmuscle/python/libmuscle/mmp_client.py b/libmuscle/python/libmuscle/mmp_client.py index 6771b1dd..de6a6897 100644 --- a/libmuscle/python/libmuscle/mmp_client.py +++ b/libmuscle/python/libmuscle/mmp_client.py @@ -1,9 +1,11 @@ +from datetime import datetime +from pathlib import Path from random import uniform from time import perf_counter, sleep -from typing import Any, Dict, Iterable, List, Tuple +from typing import Any, Dict, Iterable, List, Optional, Tuple import msgpack -from ymmsl import Conduit, Operator, Port, Reference, Settings +from ymmsl import Conduit, Operator, Port, Reference, Settings, Checkpoints from libmuscle.mcp.protocol import RequestType, ResponseType from libmuscle.mcp.tcp_transport_client import TcpTransportClient @@ -109,7 +111,8 @@ def get_settings(self) -> Settings: return Settings(response[1]) def register_instance(self, name: Reference, locations: List[str], - ports: List[Port]) -> None: + ports: List[Port] + ) -> Tuple[datetime, Checkpoints, Optional[Path]]: """Register a component instance with the manager. Args: @@ -126,6 +129,8 @@ def register_instance(self, name: Reference, locations: List[str], if len(response) > 1: raise RuntimeError( f'Error registering instance: {response[1]}') + # TODO + return (datetime.now(), Checkpoints(), None) def request_peers( self, name: Reference) -> Tuple[ diff --git a/libmuscle/python/libmuscle/snapshot_manager.py b/libmuscle/python/libmuscle/snapshot_manager.py index f8477637..cd6f9959 100644 --- a/libmuscle/python/libmuscle/snapshot_manager.py +++ b/libmuscle/python/libmuscle/snapshot_manager.py @@ -46,10 +46,10 @@ def __init__(self, self._snapshot_directory = None # type: Optional[Path] self._next_snapshot_num = 1 - def registered(self, - utc_reference: datetime, - checkpoints: Checkpoints, - resume: Optional[Path]) -> None: + def set_checkpoint_info(self, + utc_reference: datetime, + checkpoints: Checkpoints, + resume: Optional[Path]) -> None: """Callback after registering with the manager. Provide the snapshot manager with info on workflow checkpoints and if we @@ -75,7 +75,7 @@ def registered(self, def reuse_instance(self, max_f_init_next_timestamp: Optional[float], - snapshot_directory: Path, + snapshot_directory: Optional[Path], ) -> None: """Callback on Instance.reuse_instance diff --git a/libmuscle/python/libmuscle/test/test_instance.py b/libmuscle/python/libmuscle/test/test_instance.py index 99d8d37b..e8c7f9b0 100644 --- a/libmuscle/python/libmuscle/test/test_instance.py +++ b/libmuscle/python/libmuscle/test/test_instance.py @@ -1,9 +1,10 @@ +from datetime import datetime, timezone import sys from typing import Generator from unittest.mock import MagicMock, patch import pytest -from ymmsl import Operator, Reference, Settings +from ymmsl import Operator, Reference, Settings, Checkpoints from libmuscle.communicator import Message from libmuscle.instance import Instance @@ -48,6 +49,8 @@ def instance(sys_argv_instance): mmp_client_object = MagicMock() mmp_client_object.request_peers.return_value = (None, None, None) + checkpoint_info = (datetime.now(timezone.utc), Checkpoints(), None) + mmp_client_object.register_instance.return_value = checkpoint_info mmp_client.return_value = mmp_client_object instance = Instance({ @@ -64,6 +67,8 @@ def instance2(sys_argv_instance): patch('libmuscle.instance.Communicator'): mmp_client_object = MagicMock() mmp_client_object.request_peers.return_value = (None, None, None) + checkpoint_info = (datetime.now(timezone.utc), Checkpoints(), None) + mmp_client_object.register_instance.return_value = checkpoint_info mmp_client.return_value = mmp_client_object instance = Instance({ Operator.F_INIT: ['in[]'], @@ -77,6 +82,8 @@ def test_create_instance( patch('libmuscle.instance.Communicator') as comm_type: mmp_client_object = MagicMock() mmp_client_object.request_peers.return_value = (None, None, None) + checkpoint_info = (datetime.now(timezone.utc), Checkpoints(), None) + mmp_client_object.register_instance.return_value = checkpoint_info mmp_client.return_value = mmp_client_object ports = { Operator.F_INIT: ['in'], diff --git a/libmuscle/python/libmuscle/test/test_snapshot_manager.py b/libmuscle/python/libmuscle/test/test_snapshot_manager.py index a8223e53..8b86ff7a 100644 --- a/libmuscle/python/libmuscle/test/test_snapshot_manager.py +++ b/libmuscle/python/libmuscle/test/test_snapshot_manager.py @@ -18,7 +18,8 @@ def test_no_checkpointing(caplog: pytest.LogCaptureFixture, tmp_path: Path communicator.get_message_counts.return_value = {} snapshot_manager = SnapshotManager(Reference('test'), manager, communicator) - snapshot_manager.registered(datetime.now(timezone.utc), Checkpoints(), None) + snapshot_manager.set_checkpoint_info( + datetime.now(timezone.utc), Checkpoints(), None) snapshot_manager.reuse_instance(None, Path(tmp_path)) assert not snapshot_manager.resuming() @@ -42,7 +43,8 @@ def test_save_load_checkpoint(tmp_path: Path) -> None: snapshot_manager = SnapshotManager(instance_id, manager, communicator) checkpoints = Checkpoints(simulation_time=CheckpointRules(every=1)) - snapshot_manager.registered(datetime.now(timezone.utc), checkpoints, None) + snapshot_manager.set_checkpoint_info( + datetime.now(timezone.utc), checkpoints, None) snapshot_manager.reuse_instance(None, tmp_path) with pytest.raises(RuntimeError): @@ -68,7 +70,8 @@ def test_save_load_checkpoint(tmp_path: Path) -> None: snapshot_manager2 = SnapshotManager(instance_id, manager, communicator) - snapshot_manager2.registered(datetime.now(timezone.utc), checkpoints, fpath) + snapshot_manager2.set_checkpoint_info( + datetime.now(timezone.utc), checkpoints, fpath) communicator.restore_message_counts.assert_called_with(port_message_counts) assert snapshot_manager2.resuming() From ee12e5c16b67c75407a409f4055200470dcef3d0 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Thu, 1 Sep 2022 15:52:03 +0200 Subject: [PATCH 033/183] Refactoring due to ymmsl update See also https://github.com/multiscale/ymmsl-python/commit/8e6e7631c6b7f9eab26c3c730f68bb83b7752332 --- .../python/libmuscle/checkpoint_triggers.py | 48 ++++++++++--------- .../test/test_checkpoint_triggers.py | 24 +++++----- .../libmuscle/test/test_snapshot_manager.py | 4 +- 3 files changed, 40 insertions(+), 36 deletions(-) diff --git a/libmuscle/python/libmuscle/checkpoint_triggers.py b/libmuscle/python/libmuscle/checkpoint_triggers.py index dbb8fcb4..6e4d644e 100644 --- a/libmuscle/python/libmuscle/checkpoint_triggers.py +++ b/libmuscle/python/libmuscle/checkpoint_triggers.py @@ -5,7 +5,8 @@ import time from typing import List, Optional, Union -from ymmsl import CheckpointRange, CheckpointRules, Checkpoints +from ymmsl import ( + CheckpointRangeRule, CheckpointAtRule, CheckpointRule, Checkpoints) _logger = logging.getLogger(__name__) @@ -52,14 +53,16 @@ class AtCheckpointTrigger(CheckpointTrigger): This triggers at the specified times. """ - def __init__(self, at: List[Union[float, int]]) -> None: + def __init__(self, at_rules: List[CheckpointAtRule]) -> None: """Create an "at" checkpoint trigger Args: at: list of checkpoint moments """ - self._at = at - self._at.sort() # ymmsl already sorts, but just to be sure + self._at = [] + for at_rule in at_rules: + self._at.extend(at_rule.at) + self._at.sort() def next_checkpoint(self, cur_time: float) -> Optional[float]: if cur_time >= self._at[-1]: @@ -92,7 +95,7 @@ class RangeCheckpointTrigger(CheckpointTrigger): omitted, and is handled by this class as well """ - def __init__(self, range: CheckpointRange) -> None: + def __init__(self, range: CheckpointRangeRule) -> None: """Create a range of checkpoints Args: @@ -100,12 +103,12 @@ def __init__(self, range: CheckpointRange) -> None: """ self._start = range.start self._stop = range.stop - self._step = range.step + self._every = range.every self._last = None # type: Union[int, float, None] if self._stop is not None: start = 0 if self._start is None else self._start diff = self._stop - start - self._last = start + (diff // self._step) * self._step + self._last = start + (diff // self._every) * self._every def next_checkpoint(self, cur_time: float) -> Optional[float]: if self._start is not None and cur_time < self._start: @@ -114,7 +117,7 @@ def next_checkpoint(self, cur_time: float) -> Optional[float]: return None start = 0 if self._start is None else self._start diff = cur_time - start - return float(start + (diff // self._step + 1) * self._step) + return float(start + (diff // self._every + 1) * self._every) def previous_checkpoint(self, cur_time: float) -> Optional[float]: if self._start is not None and cur_time < self._start: @@ -123,30 +126,31 @@ def previous_checkpoint(self, cur_time: float) -> Optional[float]: return float(self._last) start = 0 if self._start is None else self._start diff = cur_time - start - return float(start + (diff // self._step) * self._step) + return float(start + (diff // self._every) * self._every) class CombinedCheckpointTriggers(CheckpointTrigger): """Checkpoint trigger based on a combination of "every", "at" and "ranges" """ - def __init__(self, checkpoint_rules: Optional[CheckpointRules]) -> None: + def __init__(self, checkpoint_rules: List[CheckpointRule]) -> None: """Create a new combined checkpoint trigger from the given rules Args: - checkpoint_rules: checkpoint rules (from ymmsl) defining "every", - "at", and/or "ranges" rules + checkpoint_rules: checkpoint rules (from ymmsl) """ - self._triggers = [] # type: List[CheckpointTrigger] - if checkpoint_rules is None: - return - if checkpoint_rules.every is not None: - cp_range = CheckpointRange(step=checkpoint_rules.every) - self._triggers.append(RangeCheckpointTrigger(cp_range)) - if checkpoint_rules.at: - self._triggers.append(AtCheckpointTrigger(checkpoint_rules.at)) - for cp_range in checkpoint_rules.ranges: - self._triggers.append(RangeCheckpointTrigger(cp_range)) + self._triggers = [] # type: List[CheckpointTrigger] + at_rules = [] # type: List[CheckpointAtRule] + for rule in checkpoint_rules: + if isinstance(rule, CheckpointAtRule): + if rule.at: + at_rules.append(rule) + elif isinstance(rule, CheckpointRangeRule): + self._triggers.append(RangeCheckpointTrigger(rule)) + else: + raise RuntimeError('Unknown checkpoint rule') + if at_rules: + self._triggers.append(AtCheckpointTrigger(at_rules)) def next_checkpoint(self, cur_time: float) -> Optional[float]: checkpoints = (trigger.next_checkpoint(cur_time) diff --git a/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py b/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py index 41577ad3..baf0c2c1 100644 --- a/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py +++ b/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py @@ -2,7 +2,7 @@ import logging import time import pytest -from ymmsl import CheckpointRange, CheckpointRules, Checkpoints +from ymmsl import CheckpointRangeRule, CheckpointAtRule, Checkpoints from libmuscle.checkpoint_triggers import ( CombinedCheckpointTriggers, AtCheckpointTrigger, RangeCheckpointTrigger, @@ -10,7 +10,7 @@ def test_at_checkpoint_trigger(): - trigger = AtCheckpointTrigger([1, 3, 4, 4.5, 9]) + trigger = AtCheckpointTrigger([CheckpointAtRule([1, 3, 4, 4.5, 9])]) assert trigger.next_checkpoint(0) == 1 assert trigger.previous_checkpoint(0) is None @@ -39,7 +39,7 @@ def test_at_checkpoint_trigger(): def test_range_checkpoint_trigger(): - range = CheckpointRange(start=0, stop=20, step=1.2) + range = CheckpointRangeRule(start=0, stop=20, every=1.2) trigger = RangeCheckpointTrigger(range) assert trigger.next_checkpoint(-1) == 0 @@ -59,7 +59,7 @@ def test_range_checkpoint_trigger(): def test_range_checkpoint_trigger_default_stop(): - range = CheckpointRange(start=1, step=1.2) + range = CheckpointRangeRule(start=1, every=1.2) trigger = RangeCheckpointTrigger(range) assert trigger.next_checkpoint(-1.) == 1 @@ -73,7 +73,7 @@ def test_range_checkpoint_trigger_default_stop(): def test_range_checkpoint_trigger_default_start(): - range = CheckpointRange(step=1.2, stop=10) + range = CheckpointRangeRule(every=1.2, stop=10) trigger = RangeCheckpointTrigger(range) assert trigger.next_checkpoint(10) is None @@ -87,7 +87,7 @@ def test_range_checkpoint_trigger_default_start(): def test_combined_checkpoint_trigger_every_at(): - rules = CheckpointRules(every=10, at=[3, 7, 13, 17]) + rules = [CheckpointRangeRule(every=10), CheckpointAtRule([3, 7, 13, 17])] trigger = CombinedCheckpointTriggers(rules) assert trigger.next_checkpoint(-11.) == pytest.approx(-10) @@ -107,9 +107,9 @@ def test_combined_checkpoint_trigger_every_at(): def test_combined_checkpoint_trigger_at_ranges(): - rules = CheckpointRules(at=[3, 7, 13, 17], ranges=[ - CheckpointRange(start=0, step=5, stop=20), - CheckpointRange(start=20, step=20, stop=100)]) + rules = [CheckpointAtRule([3, 7, 13, 17]), + CheckpointRangeRule(start=0, every=5, stop=20), + CheckpointRangeRule(start=20, every=20, stop=100)] trigger = CombinedCheckpointTriggers(rules) assert trigger.next_checkpoint(-11.) == pytest.approx(0) @@ -150,8 +150,8 @@ def test_trigger_manager_reference_time(): def test_trigger_manager(): reference = datetime.now(timezone.utc) trigger_manager = TriggerManager(reference, Checkpoints( - wallclock_time=CheckpointRules(at=[1e-12]), - simulation_time=CheckpointRules(at=[1, 3, 5]))) + wallclock_time=[CheckpointAtRule([1e-12])], + simulation_time=[CheckpointAtRule([1, 3, 5])])) trigger_manager.reuse_instance(7) @@ -202,7 +202,7 @@ def test_trigger_manager_warnings(caplog: pytest.LogCaptureFixture, reference = datetime.now(timezone.utc) trigger_manager = TriggerManager(reference, Checkpoints( - simulation_time=CheckpointRules(at=[1, 3, 5]))) + simulation_time=[CheckpointAtRule([1, 3, 5])])) trigger_manager.reuse_instance(2) diff --git a/libmuscle/python/libmuscle/test/test_snapshot_manager.py b/libmuscle/python/libmuscle/test/test_snapshot_manager.py index 8b86ff7a..d7d386c9 100644 --- a/libmuscle/python/libmuscle/test/test_snapshot_manager.py +++ b/libmuscle/python/libmuscle/test/test_snapshot_manager.py @@ -4,7 +4,7 @@ from unittest.mock import MagicMock import pytest -from ymmsl import Reference, Checkpoints, CheckpointRules +from ymmsl import Reference, Checkpoints, CheckpointRangeRule from libmuscle.communicator import Message from libmuscle.snapshot import SnapshotMetadata @@ -42,7 +42,7 @@ def test_save_load_checkpoint(tmp_path: Path) -> None: instance_id = Reference('test[1]') snapshot_manager = SnapshotManager(instance_id, manager, communicator) - checkpoints = Checkpoints(simulation_time=CheckpointRules(every=1)) + checkpoints = Checkpoints(simulation_time=[CheckpointRangeRule(every=1)]) snapshot_manager.set_checkpoint_info( datetime.now(timezone.utc), checkpoints, None) From 2ab52946f994ee92c3b97e9f41246a98adb8bc2d Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Thu, 1 Sep 2022 17:31:11 +0200 Subject: [PATCH 034/183] Add checkpoint info in register_instance response --- libmuscle/cpp/src/libmuscle/mmp_client.cpp | 2 +- libmuscle/python/libmuscle/manager/manager.py | 2 +- .../python/libmuscle/manager/mmp_server.py | 68 ++++++++++++++++--- .../python/libmuscle/manager/test/conftest.py | 20 +++--- .../manager/test/test_mmp_request_handler.py | 21 +++--- libmuscle/python/libmuscle/mmp_client.py | 50 ++++++++++++-- .../python/libmuscle/test/test_mmp_client.py | 6 +- 7 files changed, 132 insertions(+), 37 deletions(-) diff --git a/libmuscle/cpp/src/libmuscle/mmp_client.cpp b/libmuscle/cpp/src/libmuscle/mmp_client.cpp index acd6672d..de50e894 100644 --- a/libmuscle/cpp/src/libmuscle/mmp_client.cpp +++ b/libmuscle/cpp/src/libmuscle/mmp_client.cpp @@ -105,7 +105,7 @@ void MMPClient::register_instance( auto response = call_manager_(request); - if (response.size() > 1) + if (response[0].as() == static_cast(ResponseType::error)) throw std::runtime_error( "Error registering instance: " + response[1].as()); } diff --git a/libmuscle/python/libmuscle/manager/manager.py b/libmuscle/python/libmuscle/manager/manager.py index c8aa76d4..21f21c60 100644 --- a/libmuscle/python/libmuscle/manager/manager.py +++ b/libmuscle/python/libmuscle/manager/manager.py @@ -58,7 +58,7 @@ def __init__( pass self._server = MMPServer( - self._logger, self._configuration.settings, + self._logger, self._configuration, self._instance_registry, self._topology_store) if self._instance_manager: diff --git a/libmuscle/python/libmuscle/manager/mmp_server.py b/libmuscle/python/libmuscle/manager/mmp_server.py index 0f66690a..04a99680 100644 --- a/libmuscle/python/libmuscle/manager/mmp_server.py +++ b/libmuscle/python/libmuscle/manager/mmp_server.py @@ -1,9 +1,12 @@ +from datetime import datetime, timezone import errno import logging -from typing import Any, cast, Generator, List +from typing import Any, Dict, Optional, Tuple, cast, Generator, List import msgpack -from ymmsl import Conduit, Identifier, Operator, Port, Reference, Settings +from ymmsl import ( + Conduit, Identifier, Operator, Port, Reference, PartialConfiguration, + Checkpoints) from libmuscle.logging import LogLevel from libmuscle.manager.instance_registry import ( @@ -19,6 +22,8 @@ _logger = logging.getLogger(__name__) +_EncodedCheckpointType = Dict[str, List[Dict[str, Any]]] + def decode_operator(data: str) -> Operator: """Create an Operator from a MsgPack-compatible value.""" @@ -35,12 +40,20 @@ def encode_conduit(conduit: Conduit) -> List[str]: return [str(conduit.sender), str(conduit.receiver)] +def encode_checkpoints(checkpoints: Checkpoints) -> _EncodedCheckpointType: + """Convert a Checkpoins to a MsgPack-compatible value.""" + return { + "wallclock_time": [vars(rule) for rule in checkpoints.wallclock_time], + "simulation_time": [vars(rule) for rule in checkpoints.simulation_time] + } + + class MMPRequestHandler(RequestHandler): """Handles Manager requests.""" def __init__( self, logger: Logger, - settings: Settings, + configuration: PartialConfiguration, instance_registry: InstanceRegistry, topology_store: TopologyStore): """Create an MMPRequestHandler. @@ -52,9 +65,10 @@ def __init__( topology_store: Keeps track of how to connect things. """ self._logger = logger - self._settings = settings + self._configuration = configuration self._instance_registry = instance_registry self._topology_store = topology_store + self._reference_time = datetime.now(timezone.utc) def handle_request(self, request: bytes) -> bytes: """Handles a manager request. @@ -98,14 +112,22 @@ def _register_instance( status (ResponseType): SUCCESS or ERROR error_msg (str): An error message, only present if status equals ERROR + checkpoint_info (Tuple[str, bytes, Optional[str]]): Checkpoint info, + only present if status equals SUCCESS. The first item is an + ISO8601 encoding of the wallclock reference time (see + :meth:`datetime.datetime.isoformat`). The second item is a + yaml-encoded ymmsl.Checkpoints object. The final item is the + checkpoint filename that the registered instance should resume + from, or None if no resume is requested. """ port_objs = [decode_port(p) for p in ports] + instance = Reference(instance_id) try: - self._instance_registry.add( - Reference(instance_id), locations, port_objs) + self._instance_registry.add(instance, locations, port_objs) _logger.info(f'Registered instance {instance_id}') - return [ResponseType.SUCCESS.value] + checkpoint_info = self._get_checkpoint_info(instance) + return [ResponseType.SUCCESS.value, checkpoint_info] except AlreadyRegistered: return [ ResponseType.ERROR.value, @@ -202,7 +224,7 @@ def _get_settings(self) -> Any: """ return [ ResponseType.SUCCESS.value, - self._settings.as_ordered_dict()] + self._configuration.settings.as_ordered_dict()] def _submit_log_message( self, instance_id: str, timestamp: float, level: int, text: str @@ -261,6 +283,29 @@ def _generate_peer_instances( for peer_indices in generate_indices(peer_dims[len(dims):]): yield base + peer_indices + def _get_checkpoint_info( + self, + instance: Reference + ) -> Tuple[str, _EncodedCheckpointType, Optional[str]]: + """Get checkpoint info for an instance + + Args: + instance: The instance whose checkpoint info to get + + Returns: + wallclock_reference_time: :meth:`datetime.datetime.isoformat` + encoded UTC reference for wallclock time = 0 + checkpoints: yaml-encoded ymmsl.Checkpoints object + resume: path of the snapshot file to resume from (or None if not + resuming) + """ + resume = None + if instance in self._configuration.resume: + resume = str(self._configuration.resume[instance]) + return (self._reference_time.isoformat(), + encode_checkpoints(self._configuration.checkpoints), + resume) + class MMPServer: """The MUSCLE Manager Protocol server. @@ -272,7 +317,7 @@ class MMPServer: def __init__( self, logger: Logger, - settings: Settings, + configuration: PartialConfiguration, instance_registry: InstanceRegistry, topology_store: TopologyStore ) -> None: @@ -285,13 +330,14 @@ def __init__( Args: logger: Logger to send log messages to - settings: Settings component to get settings from + configuration: Configuration component to get settings, checkpoints + and resumes from instance_registry: To register instances with and get peer locations from topology_store: To get peers and conduits from """ self._handler = MMPRequestHandler( - logger, settings, instance_registry, topology_store) + logger, configuration, instance_registry, topology_store) try: self._server = TcpTransportServer(self._handler, 9000) except OSError as e: diff --git a/libmuscle/python/libmuscle/manager/test/conftest.py b/libmuscle/python/libmuscle/manager/test/conftest.py index 9ba095dd..433e23b1 100644 --- a/libmuscle/python/libmuscle/manager/test/conftest.py +++ b/libmuscle/python/libmuscle/manager/test/conftest.py @@ -2,7 +2,7 @@ import pytest from ymmsl import (Component, Conduit, Configuration, Model, Reference, - Settings) + PartialConfiguration) from libmuscle.manager.instance_registry import InstanceRegistry from libmuscle.manager.logger import Logger @@ -18,8 +18,8 @@ def logger(tmpdir): @pytest.fixture -def settings(): - return Settings() +def mmp_configuration(): + return PartialConfiguration() @pytest.fixture @@ -45,9 +45,10 @@ def topology_store() -> TopologyStore: @pytest.fixture -def mmp_request_handler(logger, settings, instance_registry, topology_store): +def mmp_request_handler( + logger, mmp_configuration, instance_registry, topology_store): return MMPRequestHandler( - logger, settings, instance_registry, topology_store) + logger, mmp_configuration, instance_registry, topology_store) @pytest.fixture @@ -63,9 +64,9 @@ def loaded_instance_registry(instance_registry): @pytest.fixture def registered_mmp_request_handler( - logger, settings, loaded_instance_registry, topology_store): + logger, mmp_configuration, loaded_instance_registry, topology_store): return MMPRequestHandler( - logger, settings, loaded_instance_registry, topology_store) + logger, mmp_configuration, loaded_instance_registry, topology_store) @pytest.fixture @@ -109,6 +110,7 @@ def loaded_instance_registry2(): @pytest.fixture def registered_mmp_request_handler2( - logger, settings, loaded_instance_registry2, topology_store2): + logger, mmp_configuration, loaded_instance_registry2, topology_store2): return MMPRequestHandler( - logger, settings, loaded_instance_registry2, topology_store2) + logger, mmp_configuration, + loaded_instance_registry2, topology_store2) diff --git a/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py b/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py index 6c40e02e..733baa61 100644 --- a/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py +++ b/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py @@ -6,9 +6,10 @@ from libmuscle.mcp.protocol import RequestType, ResponseType -def test_create_servicer(logger, settings, instance_registry, +def test_create_servicer(logger, mmp_configuration, instance_registry, topology_store): - MMPRequestHandler(logger, settings, instance_registry, topology_store) + MMPRequestHandler( + logger, mmp_configuration, instance_registry, topology_store) def test_log_message(mmp_request_handler, caplog): @@ -31,7 +32,7 @@ def test_log_message(mmp_request_handler, caplog): assert caplog.records[0].message == 'Testing log message' -def test_get_settings(settings, mmp_request_handler): +def test_get_settings(mmp_configuration, mmp_request_handler): request = [RequestType.GET_SETTINGS.value] encoded_request = msgpack.packb(request, use_bin_type=True) @@ -42,12 +43,12 @@ def test_get_settings(settings, mmp_request_handler): assert decoded_result[0] == ResponseType.SUCCESS.value assert decoded_result[1] == {} - settings['test1'] = 13 - settings['test2'] = 12.3 - settings['test3'] = 'testing' - settings['test4'] = True - settings['test5'] = [2.3, 7.4] - settings['test6'] = [[1.0, 2.0], [2.0, 1.0]] + mmp_configuration.settings['test1'] = 13 + mmp_configuration.settings['test2'] = 12.3 + mmp_configuration.settings['test3'] = 'testing' + mmp_configuration.settings['test4'] = True + mmp_configuration.settings['test5'] = [2.3, 7.4] + mmp_configuration.settings['test6'] = [[1.0, 2.0], [2.0, 1.0]] result = mmp_request_handler.handle_request(encoded_request) decoded_result = msgpack.unpackb(result, raw=False) @@ -63,7 +64,7 @@ def test_get_settings(settings, mmp_request_handler): assert result_dict['test4'] is True assert result_dict['test5'] == [2.3, 7.4] assert result_dict['test6'] == [[1.0, 2.0], [2.0, 1.0]] - assert result_dict == settings.as_ordered_dict() + assert result_dict == mmp_configuration.settings.as_ordered_dict() def test_register_instance(mmp_request_handler, instance_registry): diff --git a/libmuscle/python/libmuscle/mmp_client.py b/libmuscle/python/libmuscle/mmp_client.py index de6a6897..685ae888 100644 --- a/libmuscle/python/libmuscle/mmp_client.py +++ b/libmuscle/python/libmuscle/mmp_client.py @@ -5,7 +5,9 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple import msgpack -from ymmsl import Conduit, Operator, Port, Reference, Settings, Checkpoints +from ymmsl import ( + Conduit, Operator, Port, Reference, Settings, Checkpoints, + CheckpointRule, CheckpointRangeRule, CheckpointAtRule) from libmuscle.mcp.protocol import RequestType, ResponseType from libmuscle.mcp.tcp_transport_client import TcpTransportClient @@ -48,6 +50,42 @@ def encode_profile_event(event: ProfileEvent) -> Any: event.message_size] +def decode_checkpoint_rule(rule: Dict[str, Any]) -> CheckpointRule: + """Decode a checkpoint rule from a MsgPack-compatible value.""" + if rule.keys() == {'in'}: + return CheckpointAtRule(**rule) + if rule.keys() == {'start', 'stop', 'every'}: + return CheckpointRangeRule(**rule) + raise ValueError('Cannot convert {rule} to a checkpoint rule.') + + +def decode_checkpoint_info( + iso_walltime_reference: str, + checkpoints_dict: Dict[str, List[Dict[str, Any]]], + resume: Optional[str] + ) -> Tuple[datetime, Checkpoints, Optional[Path]]: + """Decode checkpoint info from a MsgPack-compatible value. + + Args: + iso_walltime_reference: iso8601 string generated by datetime.isoformat + checkpoints_dict: dictionary of checkpoint definitions + resume: optional string indicating resume path + + Returns: + wallclock_time_reference: UTC time where wallclock_time = 0 + checkpoints: checkpoint configuration + resume: path to the resume snapshot + """ + wallclock_time_reference = datetime.fromisoformat(iso_walltime_reference) + checkpoints = Checkpoints( + wallclock_time=[decode_checkpoint_rule(rule) + for rule in checkpoints_dict["wallclock_time"]], + simulation_time=[decode_checkpoint_rule(rule) + for rule in checkpoints_dict["simulation_time"]]) + resume_path = None if resume is None else Path(resume) + return (wallclock_time_reference, checkpoints, resume_path) + + class MMPClient(): """The client for the MUSCLE Manager Protocol. @@ -120,17 +158,21 @@ def register_instance(self, name: Reference, locations: List[str], locations: List of places where the instance can be reached. ports: List of ports of this instance. + + Returns: + wallclock_time_reference: UTC time where wallclock_time = 0 + checkpoints: checkpoint configuration + resume: path to the resume snapshot """ request = [ RequestType.REGISTER_INSTANCE.value, str(name), locations, [encode_port(p) for p in ports]] response = self._call_manager(request) - if len(response) > 1: + if response[0] == ResponseType.ERROR.value: raise RuntimeError( f'Error registering instance: {response[1]}') - # TODO - return (datetime.now(), Checkpoints(), None) + return decode_checkpoint_info(*response[1]) def request_peers( self, name: Reference) -> Tuple[ diff --git a/libmuscle/python/libmuscle/test/test_mmp_client.py b/libmuscle/python/libmuscle/test/test_mmp_client.py index d5051962..be098d61 100644 --- a/libmuscle/python/libmuscle/test/test_mmp_client.py +++ b/libmuscle/python/libmuscle/test/test_mmp_client.py @@ -1,3 +1,4 @@ +from datetime import datetime, timezone from unittest.mock import patch import msgpack @@ -73,7 +74,10 @@ def test_get_settings(mocked_mmp_client) -> None: def test_register_instance(mocked_mmp_client) -> None: client, stub = mocked_mmp_client - result = [ResponseType.SUCCESS.value] + result = [ResponseType.SUCCESS.value, + (datetime.now(timezone.utc).isoformat(), + {'wallclock_time': [], 'simulation_time': []}, + None)] stub.call.return_value = msgpack.packb(result, use_bin_type=True) client.register_instance( From ab9fbd5aa3b3c9a6a0e86134f3713b35903436e6 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Fri, 2 Sep 2022 10:09:15 +0200 Subject: [PATCH 035/183] Send reference time as tuple instead of ISO string --- .../python/libmuscle/manager/mmp_server.py | 24 ++++++++++++------- libmuscle/python/libmuscle/mmp_client.py | 11 +++++---- .../python/libmuscle/test/test_mmp_client.py | 6 ++++- 3 files changed, 26 insertions(+), 15 deletions(-) diff --git a/libmuscle/python/libmuscle/manager/mmp_server.py b/libmuscle/python/libmuscle/manager/mmp_server.py index 04a99680..b237f316 100644 --- a/libmuscle/python/libmuscle/manager/mmp_server.py +++ b/libmuscle/python/libmuscle/manager/mmp_server.py @@ -22,6 +22,7 @@ _logger = logging.getLogger(__name__) +_EncodedTimeType = Tuple[int, int, int, int, int, int, int] _EncodedCheckpointType = Dict[str, List[Dict[str, Any]]] @@ -68,7 +69,10 @@ def __init__( self._configuration = configuration self._instance_registry = instance_registry self._topology_store = topology_store - self._reference_time = datetime.now(timezone.utc) + reftime = datetime.now(timezone.utc) + self._reference_time_tuple = (reftime.year, reftime.month, reftime.day, + reftime.hour, reftime.minute, + reftime.second, reftime.microsecond) def handle_request(self, request: bytes) -> bytes: """Handles a manager request. @@ -113,10 +117,10 @@ def _register_instance( error_msg (str): An error message, only present if status equals ERROR checkpoint_info (Tuple[str, bytes, Optional[str]]): Checkpoint info, - only present if status equals SUCCESS. The first item is an - ISO8601 encoding of the wallclock reference time (see - :meth:`datetime.datetime.isoformat`). The second item is a - yaml-encoded ymmsl.Checkpoints object. The final item is the + only present if status equals SUCCESS. The first item is a tuple + encoding of the wallclock reference time (year, month, day, + hour, minute, second, microsecond) in UTC. The second item is a + dict encoding a ymmsl.Checkpoints object. The final item is the checkpoint filename that the registered instance should resume from, or None if no resume is requested. """ @@ -286,15 +290,17 @@ def _generate_peer_instances( def _get_checkpoint_info( self, instance: Reference - ) -> Tuple[str, _EncodedCheckpointType, Optional[str]]: + ) -> Tuple[_EncodedTimeType, + _EncodedCheckpointType, + Optional[str]]: """Get checkpoint info for an instance Args: instance: The instance whose checkpoint info to get Returns: - wallclock_reference_time: :meth:`datetime.datetime.isoformat` - encoded UTC reference for wallclock time = 0 + wallclock_reference_time: tuple encoding UTC reference for wallclock + time = 0: (year, month, day, hour, minute, second, microsecond) checkpoints: yaml-encoded ymmsl.Checkpoints object resume: path of the snapshot file to resume from (or None if not resuming) @@ -302,7 +308,7 @@ def _get_checkpoint_info( resume = None if instance in self._configuration.resume: resume = str(self._configuration.resume[instance]) - return (self._reference_time.isoformat(), + return (self._reference_time_tuple, encode_checkpoints(self._configuration.checkpoints), resume) diff --git a/libmuscle/python/libmuscle/mmp_client.py b/libmuscle/python/libmuscle/mmp_client.py index 685ae888..f8f05330 100644 --- a/libmuscle/python/libmuscle/mmp_client.py +++ b/libmuscle/python/libmuscle/mmp_client.py @@ -1,4 +1,4 @@ -from datetime import datetime +from datetime import datetime, timezone from pathlib import Path from random import uniform from time import perf_counter, sleep @@ -60,14 +60,15 @@ def decode_checkpoint_rule(rule: Dict[str, Any]) -> CheckpointRule: def decode_checkpoint_info( - iso_walltime_reference: str, + utc_walltime_reference: Tuple[int, int, int, int, int, int, int], checkpoints_dict: Dict[str, List[Dict[str, Any]]], resume: Optional[str] ) -> Tuple[datetime, Checkpoints, Optional[Path]]: """Decode checkpoint info from a MsgPack-compatible value. Args: - iso_walltime_reference: iso8601 string generated by datetime.isoformat + utc_walltime_reference: tuple (year, month, day, hour, minute, second, + microsecond) in UTC timezone checkpoints_dict: dictionary of checkpoint definitions resume: optional string indicating resume path @@ -76,14 +77,14 @@ def decode_checkpoint_info( checkpoints: checkpoint configuration resume: path to the resume snapshot """ - wallclock_time_reference = datetime.fromisoformat(iso_walltime_reference) + ref_time = datetime(*utc_walltime_reference, tzinfo=timezone.utc) checkpoints = Checkpoints( wallclock_time=[decode_checkpoint_rule(rule) for rule in checkpoints_dict["wallclock_time"]], simulation_time=[decode_checkpoint_rule(rule) for rule in checkpoints_dict["simulation_time"]]) resume_path = None if resume is None else Path(resume) - return (wallclock_time_reference, checkpoints, resume_path) + return (ref_time, checkpoints, resume_path) class MMPClient(): diff --git a/libmuscle/python/libmuscle/test/test_mmp_client.py b/libmuscle/python/libmuscle/test/test_mmp_client.py index be098d61..e93d6014 100644 --- a/libmuscle/python/libmuscle/test/test_mmp_client.py +++ b/libmuscle/python/libmuscle/test/test_mmp_client.py @@ -74,8 +74,12 @@ def test_get_settings(mocked_mmp_client) -> None: def test_register_instance(mocked_mmp_client) -> None: client, stub = mocked_mmp_client + reftime = datetime.now(timezone.utc) + reference_time_tuple = (reftime.year, reftime.month, reftime.day, + reftime.hour, reftime.minute, + reftime.second, reftime.microsecond) result = [ResponseType.SUCCESS.value, - (datetime.now(timezone.utc).isoformat(), + (reference_time_tuple, {'wallclock_time': [], 'simulation_time': []}, None)] stub.call.return_value = msgpack.packb(result, use_bin_type=True) From dddc630c0bc0059013e072e2541bf2dbf2109448 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Fri, 2 Sep 2022 11:03:06 +0200 Subject: [PATCH 036/183] Send reference wallclock time as timestamp --- .../python/libmuscle/manager/mmp_server.py | 13 ++---- .../manager/test/test_mmp_request_handler.py | 40 ++++++++++++++++++- libmuscle/python/libmuscle/mmp_client.py | 8 ++-- .../python/libmuscle/test/test_mmp_client.py | 6 +-- 4 files changed, 48 insertions(+), 19 deletions(-) diff --git a/libmuscle/python/libmuscle/manager/mmp_server.py b/libmuscle/python/libmuscle/manager/mmp_server.py index b237f316..793bbfc8 100644 --- a/libmuscle/python/libmuscle/manager/mmp_server.py +++ b/libmuscle/python/libmuscle/manager/mmp_server.py @@ -22,7 +22,6 @@ _logger = logging.getLogger(__name__) -_EncodedTimeType = Tuple[int, int, int, int, int, int, int] _EncodedCheckpointType = Dict[str, List[Dict[str, Any]]] @@ -69,10 +68,8 @@ def __init__( self._configuration = configuration self._instance_registry = instance_registry self._topology_store = topology_store - reftime = datetime.now(timezone.utc) - self._reference_time_tuple = (reftime.year, reftime.month, reftime.day, - reftime.hour, reftime.minute, - reftime.second, reftime.microsecond) + self._reference_time = datetime.now(timezone.utc) + self._reference_timestamp = self._reference_time.timestamp() def handle_request(self, request: bytes) -> bytes: """Handles a manager request. @@ -290,9 +287,7 @@ def _generate_peer_instances( def _get_checkpoint_info( self, instance: Reference - ) -> Tuple[_EncodedTimeType, - _EncodedCheckpointType, - Optional[str]]: + ) -> Tuple[float, _EncodedCheckpointType, Optional[str]]: """Get checkpoint info for an instance Args: @@ -308,7 +303,7 @@ def _get_checkpoint_info( resume = None if instance in self._configuration.resume: resume = str(self._configuration.resume[instance]) - return (self._reference_time_tuple, + return (self._reference_timestamp, encode_checkpoints(self._configuration.checkpoints), resume) diff --git a/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py b/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py index 733baa61..0d91c650 100644 --- a/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py +++ b/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py @@ -1,5 +1,8 @@ +from datetime import datetime, timezone +from pathlib import Path import msgpack -from ymmsl import Operator, Reference +from ymmsl import ( + Operator, Reference, Checkpoints, CheckpointRangeRule, CheckpointAtRule) from libmuscle.logging import LogLevel from libmuscle.manager.mmp_server import MMPRequestHandler @@ -87,6 +90,41 @@ def test_register_instance(mmp_request_handler, instance_registry): assert registered_ports['test_instance'][0].operator == Operator.F_INIT +def test_register_instance_checkpoint_info( + mmp_configuration, mmp_request_handler): + resume_path = Path('/path/to/resume.pack') + mmp_configuration.resume = {Reference('test_instance'): resume_path} + mmp_configuration.checkpoints = Checkpoints([CheckpointRangeRule(every=10), + CheckpointAtRule([1, 2, 3.0])]) + + request = [ + RequestType.REGISTER_INSTANCE.value, + 'test_instance', + ['tcp://localhost:10000'], + [['test_in', 'F_INIT']]] + encoded_request = msgpack.packb(request, use_bin_type=True) + + result = mmp_request_handler.handle_request(encoded_request) + decoded_result = msgpack.unpackb(result, raw=False) + + assert decoded_result[0] == ResponseType.SUCCESS.value + timestamp, checkpoints, resume = decoded_result[1] + + ref_time = datetime.fromtimestamp(timestamp, tz=timezone.utc) + assert ref_time == mmp_request_handler._reference_time + + assert isinstance(checkpoints, dict) + assert checkpoints.keys() == {'wallclock_time', 'simulation_time'} + assert checkpoints['simulation_time'] == [] + wallclock_time = checkpoints['wallclock_time'] + assert len(wallclock_time) == 2 + assert wallclock_time[0] == {'start': None, 'stop': None, 'every': 10} + assert wallclock_time[1] == {'at': [1, 2, 3.0]} + + assert resume is not None + assert Path(resume) == resume_path + + def test_double_register_instance(mmp_request_handler): request = [ RequestType.REGISTER_INSTANCE.value, diff --git a/libmuscle/python/libmuscle/mmp_client.py b/libmuscle/python/libmuscle/mmp_client.py index f8f05330..6376aa20 100644 --- a/libmuscle/python/libmuscle/mmp_client.py +++ b/libmuscle/python/libmuscle/mmp_client.py @@ -60,15 +60,15 @@ def decode_checkpoint_rule(rule: Dict[str, Any]) -> CheckpointRule: def decode_checkpoint_info( - utc_walltime_reference: Tuple[int, int, int, int, int, int, int], + reference_timestamp: float, checkpoints_dict: Dict[str, List[Dict[str, Any]]], resume: Optional[str] ) -> Tuple[datetime, Checkpoints, Optional[Path]]: """Decode checkpoint info from a MsgPack-compatible value. Args: - utc_walltime_reference: tuple (year, month, day, hour, minute, second, - microsecond) in UTC timezone + reference_timestamp: seconds since UNIX epoch in UTC timezone to use as + wallclock_time = 0 checkpoints_dict: dictionary of checkpoint definitions resume: optional string indicating resume path @@ -77,7 +77,7 @@ def decode_checkpoint_info( checkpoints: checkpoint configuration resume: path to the resume snapshot """ - ref_time = datetime(*utc_walltime_reference, tzinfo=timezone.utc) + ref_time = datetime.fromtimestamp(reference_timestamp, tz=timezone.utc) checkpoints = Checkpoints( wallclock_time=[decode_checkpoint_rule(rule) for rule in checkpoints_dict["wallclock_time"]], diff --git a/libmuscle/python/libmuscle/test/test_mmp_client.py b/libmuscle/python/libmuscle/test/test_mmp_client.py index e93d6014..a47311a6 100644 --- a/libmuscle/python/libmuscle/test/test_mmp_client.py +++ b/libmuscle/python/libmuscle/test/test_mmp_client.py @@ -74,12 +74,8 @@ def test_get_settings(mocked_mmp_client) -> None: def test_register_instance(mocked_mmp_client) -> None: client, stub = mocked_mmp_client - reftime = datetime.now(timezone.utc) - reference_time_tuple = (reftime.year, reftime.month, reftime.day, - reftime.hour, reftime.minute, - reftime.second, reftime.microsecond) result = [ResponseType.SUCCESS.value, - (reference_time_tuple, + (datetime.now(timezone.utc).timestamp(), {'wallclock_time': [], 'simulation_time': []}, None)] stub.call.return_value = msgpack.packb(result, use_bin_type=True) From 23f7f873d273430b73d15d0407450c70b71f8cb5 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Thu, 8 Sep 2022 15:10:22 +0200 Subject: [PATCH 037/183] Implementation of workflow snapshot heuristic --- .../libmuscle/manager/snapshot_registry.py | 438 ++++++++++++++++++ .../manager/test/test_snapshot_registry.py | 330 +++++++++++++ 2 files changed, 768 insertions(+) create mode 100644 libmuscle/python/libmuscle/manager/snapshot_registry.py create mode 100644 libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py diff --git a/libmuscle/python/libmuscle/manager/snapshot_registry.py b/libmuscle/python/libmuscle/manager/snapshot_registry.py new file mode 100644 index 00000000..1fd4b12d --- /dev/null +++ b/libmuscle/python/libmuscle/manager/snapshot_registry.py @@ -0,0 +1,438 @@ +from dataclasses import dataclass, field +from enum import Flag, auto +from itertools import chain, zip_longest +from operator import attrgetter +from typing import Dict, Optional, Set, List, Tuple, TypeVar + +from ymmsl import Reference, Configuration, Identifier, Implementation +from ymmsl import ImplementationState as IState + +from libmuscle.snapshot import SnapshotMetadata + + +_SnapshotDictType = Dict[Reference, List["SnapshotNode"]] +_ConnectionType = Tuple[Identifier, Identifier, "_ConnectionInfo"] +_T = TypeVar("_T") + + +def safe_get(lst: List[_T], index: int, default: _T) -> _T: + """Get an item from the list, returning default when it does not exist. + + Args: + lst: List to get the item from + index: Which item to get, should be >= 0 + default: Value to return when hitting an IndexError + """ + try: + return lst[index] + except IndexError: + return default + + +class _ConnectionInfo(Flag): + SELF_IS_SENDING = auto() + SELF_IS_VECTOR = auto() + PEER_IS_VECTOR = auto() + + +def calc_consistency(num1: int, num2: int, first_is_sent: bool) -> bool: + """Calculate consistency of message counts. + + Args: + num1: message count of instance 1 + num2: message count of instance 2 + first_is_sent: True iff instance 1 is sending messages over this conduit + + Returns: + True iff the two message counts are consistent + """ + return (num1 == num2 or # strong + num1 + 1 == num2 and first_is_sent or # weak (1 = sent) + num2 + 1 == num1 and not first_is_sent) # weak (2 = sent) + + +def calc_consistency_list( + num1: List[int], num2: List[int], first_is_sent: bool) -> bool: + """Calculate consistency of message counts. + + Args: + num1: message count of instance 1 + num2: message count of instance 2 + first_is_sent: True iff instance 1 is sending messages over this conduit + + Returns: + True iff the two message counts are consistent + """ + if first_is_sent: + slot_iter = zip_longest(num1, num2, fillvalue=0) + else: + slot_iter = zip_longest(num2, num1, fillvalue=0) + return all(slot_sent == slot_received or # strong + slot_sent + 1 == slot_received # weak + for slot_sent, slot_received in slot_iter) + + +@dataclass +class SnapshotNode: + """Represents a node in the snapshot graph. + + Attributes: + num: The number of the snapshot. Unique for this instance. Later + snapshots always have a higher num. + instance: Which instance this is a snapshot of. + snapshot: The snapshot metadata reported by the instance. + stateful_peers: The set of peers that the instance is connected to that + have state, which we need to check consistency with. + consistent_peers: Keeps track of snapshots per peer that are consistent + with this one. + """ + num: int + instance: Reference + snapshot: SnapshotMetadata + stateful_peers: Set[Reference] + consistent_peers: Dict[Reference, List["SnapshotNode"]] = field( + default_factory=dict, repr=False) + + def __hash__(self) -> int: + return object.__hash__(self) + + @property + def consistent(self) -> bool: + """Returns True iff there is a consistent checkpoint will all stateful + peers. + """ + return self.consistent_peers.keys() == self.stateful_peers + + def do_consistency_check( + self, + peer_node: "SnapshotNode", + connections: List[_ConnectionType]) -> bool: + """Check if the snapshot of the peer is consistent with us. + + When the peer snapshot is consistent, adds it to our list of consistent + peer snapshots (in :attribute:`consistent_peers`) and vice versa. + + Args: + peer_node: Snapshot of one of our peers + connections: All connections from our instance to the peer instance + + Returns: + True iff the peer snapshot is consistent with ours. + """ + i_snapshot = self.snapshot + p_snapshot = peer_node.snapshot + for connection in connections: + i_port, p_port, conn = connection + is_sending = bool(conn & _ConnectionInfo.SELF_IS_SENDING) + i_msg_counts = i_snapshot.port_message_counts.get(str(i_port), []) + p_msg_counts = p_snapshot.port_message_counts.get(str(p_port), []) + if conn & _ConnectionInfo.SELF_IS_VECTOR: + slot = int(peer_node.instance[-1]) + consistent = calc_consistency( + safe_get(i_msg_counts, slot, 0), + safe_get(p_msg_counts, 0, 0), + is_sending) + elif conn & _ConnectionInfo.PEER_IS_VECTOR: + slot = int(self.instance[-1]) + consistent = calc_consistency( + safe_get(i_msg_counts, 0, 0), + safe_get(p_msg_counts, slot, 0), + is_sending) + else: + consistent = calc_consistency_list( + i_msg_counts, p_msg_counts, is_sending) + if not consistent: # not consistent + return False + self.consistent_peers.setdefault( + peer_node.instance, []).append(peer_node) + peer_node.consistent_peers.setdefault( + self.instance, []).append(self) + return True + + +class SnapshotRegistry: + """Registry of all snapshots taken by instances. + + Current snapshots are stored in a graph. Every node represents a snapshot + taken by an instance (see :class:`SnapshotNode`). When snapshots from peer + instances are consistent, the nodes are connected to each other. + + This class manages the snapshot nodes. New snapshots are registered through + :meth:`register_snapshot`. + """ + + def __init__(self, configuration: Configuration) -> None: + """Create a snapshot graph using provided configuration. + + Args: + configuration: ymmsl configuration describing the workflow. + """ + self._configuration = configuration + + self._snapshots = {} # type: _SnapshotDictType + + self._instances = set() # type: Set[Reference] + self._stateful_instances = set() # type: Set[Reference] + for component in configuration.model.components: + instances = set(component.instances()) + self._instances.update(instances) + if self._is_stateful(component.name): + self._stateful_instances.update(instances) + + def register_snapshot( + self, instance: Reference, snapshot: SnapshotMetadata) -> None: + """Register a new snapshot. + + Args: + instance: The instance that created the snapshot + snapshot: Metadata describing the snapshot + """ + stateful_peers = self._get_stateful_peers(instance) + + i_snapshots = self._snapshots.setdefault(instance, []) + # get next number of the snapshot + num = 1 if not i_snapshots else i_snapshots[-1].num + 1 + snapshotnode = SnapshotNode(num, instance, snapshot, stateful_peers) + i_snapshots.append(snapshotnode) + + # check consistency with all peers + for peer in stateful_peers: + for peer_snapshot in self._snapshots.get(peer, []): + snapshotnode.do_consistency_check( + peer_snapshot, self._get_connections(instance, peer)) + + # finally, check if this snapshotnode is now part of a workflow snapshot + self._save_workflow_snapshot(snapshotnode) + + def _save_workflow_snapshot(self, snapshotnode: SnapshotNode) -> None: + """Save snapshot if a workflow snapshot exists with the provided node. + + Args: + snapshotnode: The snapshot node that must be part of the workflow + snapshot. + """ + selected_snapshots = self._get_workflow_snapshot(snapshotnode) + if selected_snapshots is not None: + self._write_snapshot_ymmsl(selected_snapshots) + self._cleanup_snapshots(selected_snapshots) + + def _get_workflow_snapshot( + self, snapshot: SnapshotNode) -> Optional[List[SnapshotNode]]: + """Check if a workflow snapshot exists that contains the provided node. + + Note: if the provided snapshot node is part of multiple workflow + snapshots, only the most recent is detected and written to disk. + + Args: + snapshotnode: The snapshot node that must be part of the workflow + snapshot. + """ + # This implements a greedy assignment algorithm. + if not snapshot.consistent: + return None + + # Instances that don't have a snapshot node chosen yet: + instances_to_cover = list( + self._stateful_instances - {snapshot.instance}) + # Allowed snapshots per instance. This is updated during the heuristic + # to further restrict the sets of snapshots as peer snapshots are + # selected. + # First restriction is that the snapshots have to be locally consistent. + allowed_snapshots = {} # type: Dict[Reference, Set[SnapshotNode]] + for instance in instances_to_cover: + allowed_snapshots[instance] = set( + i_snapshot + for i_snapshot in self._snapshots.get(instance, []) + if i_snapshot.consistent) + if not allowed_snapshots[instance]: + # there cannot be a workflow snapshot if this instance has no + # consistent snapshot nodes + return None + instance = snapshot.instance + allowed_snapshots[instance] = {snapshot} + + def num_allowed_snapshots(instance: Reference) -> int: + """Get number of allowed snapshots at this point for this instance. + + The allowed snapshots are those that are consistent with all + selected snapshots at this point in the heuristic. + """ + return len(allowed_snapshots[instance]) + + selected_snapshots = [snapshot] + # This stack stores history of allowed_snapshots and enables roll back + stack = [] # type: List[Dict[Reference, Set[SnapshotNode]]] + + # update allowed_snapshots for peers + for peer, snapshots in snapshot.consistent_peers.items(): + allowed_snapshots[peer].intersection_update(snapshots) + if not allowed_snapshots[peer]: + return None + + while instances_to_cover: + # select most constrained instance + # + # Note: we're only interested in the instance with the least allowed + # snapshots. Better performance may be possible by not doing a full + # sort, but it should be tested. Expectation is that + # instances_to_cover remains mostly sorted (as the only counts that + # are changing are for peers of the previous selected instance). + # Python's sort algorithm is O(N) when the list is already sorted + # (which is the same as max()). + # + # We cannot use a priority queue (heapq) because + # num_allowed_snapshots is changing every iteration. + instances_to_cover.sort(key=num_allowed_snapshots, reverse=True) + instance = instances_to_cover.pop() + + # select latest snapshot of this instance + snapshot = max(allowed_snapshots[instance], key=attrgetter("num")) + selected_snapshots.append(snapshot) + # we put a shallow copy on the stack, so we are not allowed to + # modify the sets in the dictionary (see below) + stack.append(allowed_snapshots.copy()) + + # update allowed snapshots with the currently selected + allowed_snapshots[instance] = {snapshot} + for peer, snapshots in snapshot.consistent_peers.items(): + # not updating in place to preserve set objects in the stack + intersection = allowed_snapshots[peer].intersection(snapshots) + if not intersection: + break # roll back + allowed_snapshots[peer] = intersection + else: + # not rolling back, go into next iteration of the while-loop + continue + + # roll back should stop when selected_snapshots only contains the + # one we forced to be part of the workflow snapshot + while len(selected_snapshots) > 1: + # roll back + snapshot = selected_snapshots.pop() + instance = snapshot.instance + instances_to_cover.append(instance) + allowed_snapshots = stack.pop() + allowed_snapshots[instance].remove(snapshot) + if allowed_snapshots[instance]: + # we have a valid next snapshot to try for this instance + break + # no allowed_snapshots, try another roll back + else: + # we've exhausted roll back possibilities, there is no + # consistent checkpoint + return None + + return selected_snapshots + + def _write_snapshot_ymmsl( + self, selected_snapshot: List[SnapshotNode]) -> None: + ... + + def _cleanup_snapshots( + self, selected_snapshots: List[SnapshotNode]) -> None: + # remove all snapshots older than the selected ones + removed_snapshots = set() # type: Set[SnapshotNode] + for snapshot in selected_snapshots: + all_snapshots = self._snapshots[snapshot.instance] + idx = all_snapshots.index(snapshot) + self._snapshots[snapshot.instance] = all_snapshots[idx:] + removed_snapshots.update(all_snapshots[:idx]) + # remove all references in SnapshotNode.peer_snapshot to the snapshots + # that are cleaned up + for snapshot in removed_snapshots: + for peer_snapshot in chain.from_iterable( + snapshot.consistent_peers.values()): + if peer_snapshot in removed_snapshots: + # snapshot is removed anyway, no need to update references + continue + # peer_snapshot is still there, remove reference to us + peer_snapshot.consistent_peers[snapshot.instance].remove( + snapshot) + + # TODO: add caching decorator or move into an instance variable + def _get_stateful_peers(self, instance: Reference) -> Set[Reference]: + peers = set() # type: Set[Reference] + kernel = instance.without_trailing_ints() + index = [int(instance[i]) for i in range(len(kernel), len(instance))] + for conduit in self._configuration.model.conduits: + if conduit.sending_component() == kernel: + peer_kernel = conduit.receiving_component() + elif conduit.receiving_component() == kernel: + peer_kernel = conduit.sending_component() + else: + continue + if not self._is_stateful(peer_kernel): + continue + if len(index) == len(self._multiplicity(peer_kernel)): + # we must be sending to the peer with the same index as us + peers.add(peer_kernel + index) + elif len(index) + 1 == len(self._multiplicity(peer_kernel)): + # we are sending on a vector port, peer is receiving non-vector + # generate all peer indices + for i in range(self._multiplicity(peer_kernel)[-1]): + peers.add(peer_kernel + index + i) + elif len(index) - 1 == len(self._multiplicity(peer_kernel)): + # we are sending to a vector port, strip last of our indices + peers.add(peer_kernel + index[:-1]) + return peers + + # TODO: add caching decorator or move into an instance variable + def _get_connections(self, instance: Reference, peer: Reference + ) -> List[_ConnectionType]: + instance_kernel = instance.without_trailing_ints() + peer_kernel = peer.without_trailing_ints() + + connected_ports = [] # type: List[_ConnectionType] + for conduit in self._configuration.model.conduits: + if (conduit.sending_component() == instance_kernel and + conduit.receiving_component() == peer_kernel): + conn_type = _ConnectionInfo.SELF_IS_SENDING + elif (conduit.receiving_component() == instance_kernel and + conduit.sending_component() == peer_kernel): + conn_type = _ConnectionInfo(0) + else: + continue + instance_ndim = (len(instance) - len(instance_kernel)) + peer_ndim = (len(peer) - len(peer_kernel)) + if instance_ndim < peer_ndim: + conn_type |= _ConnectionInfo.SELF_IS_VECTOR + if instance_ndim > peer_ndim: + conn_type |= _ConnectionInfo.PEER_IS_VECTOR + # we cannot distinguish scalar-scalar vs. vector-vector + # but it does not matter for this logic :) + if conn_type & _ConnectionInfo.SELF_IS_SENDING: + connected_ports.append(( + conduit.sending_port(), + conduit.receiving_port(), + conn_type)) + else: + connected_ports.append(( + conduit.receiving_port(), + conduit.sending_port(), + conn_type)) + return connected_ports + + # TODO: add caching decorator or move into an instance variable + def _multiplicity(self, kernel: Reference) -> List[int]: + for component in self._configuration.model.components: + if component.name == kernel: + return component.multiplicity + raise KeyError(str(kernel)) + + # TODO: add caching decorator or move into an instance variable + def _implementation(self, kernel: Reference) -> Optional[Implementation]: + implementation = None + for component in self._configuration.model.components: + if component.name == kernel: + implementation = component.implementation + if implementation in self._configuration.implementations: + return self._configuration.implementations[implementation] + return None + + def _is_stateful(self, kernel: Reference) -> bool: + implementation = self._implementation(kernel) + if implementation is None: + return True # assume stateful + return (implementation.stateful is IState.STATEFUL or + implementation.stateful is IState.WEAKLY_STATEFUL and + implementation.supports_checkpoint) diff --git a/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py b/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py new file mode 100644 index 00000000..7485b0a8 --- /dev/null +++ b/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py @@ -0,0 +1,330 @@ +from unittest.mock import MagicMock + +import pytest +from libmuscle.snapshot import SnapshotMetadata +from ymmsl import ( + Configuration, Model, Component, Conduit, Implementation, + ImplementationState as IState, Reference) + +from libmuscle.manager.snapshot_registry import ( + SnapshotRegistry, calc_consistency, calc_consistency_list, safe_get, + _ConnectionInfo) + + +def make_snapshot(**msg_counts) -> SnapshotMetadata: + return SnapshotMetadata([], 0, 0, 0, {**msg_counts}, False, '') + + +@pytest.fixture(params=[True, False]) +def micro_is_stateless(request: pytest.FixtureRequest) -> bool: + return request.param + + +@pytest.fixture +def macro_micro(micro_is_stateless: bool) -> Configuration: + components = [ + Component('macro', 'macro_impl'), + Component('micro', 'micro_impl')] + conduits = [ + Conduit('macro.o_i', 'micro.f_i'), + Conduit('micro.o_f', 'macro.s')] + model = Model('macro_micro', components, conduits) + + if micro_is_stateless: + micro_impl = Implementation( + 'micro_impl', stateful=IState.STATELESS, executable='pass') + else: + micro_impl = Implementation( + 'micro_impl', supports_checkpoint=True, executable='pass') + + implementations = [ + Implementation( + 'macro_impl', supports_checkpoint=True, executable='pass'), + micro_impl] + + return Configuration(model, implementations=implementations) + + +@pytest.fixture +def uq(macro_micro: Configuration) -> Configuration: + for component in macro_micro.model.components: + component.multiplicity = [5] + macro_micro.model.components.append(Component('qmc', 'qmc_impl')) + macro_micro.model.components.append(Component('rr', 'rr_impl')) + macro_micro.model.conduits.extend([ + Conduit('qmc.parameters_out', 'rr.front_in'), + Conduit('rr.front_out', 'qmc.states_in'), + Conduit('rr.back_out', 'macro.muscle_settings_in'), + Conduit('macro.final_state_out', 'rr.back_in')]) + macro_micro.implementations[Reference('qmc_impl')] = Implementation( + 'qmc_impl', supports_checkpoint=True, executable='pass') + macro_micro.implementations[Reference('rr_impl')] = Implementation( + 'rr_impl', supports_checkpoint=True, executable='pass') + return macro_micro + + +def test_safe_get() -> None: + assert safe_get([], 0, 1) == 1 + assert safe_get([3], 0, 1) == 3 + assert safe_get([3], 1, 5) == 5 + for i in range(10): + expected = -1 if i >= 3 else i + 3 + assert safe_get([3, 4, 5], i, -1) == expected + + +def test_calc_consistency() -> None: + num_sent = 3 + for num_received in [2, 3, 4, 5]: + consistent = num_received in [3, 4] + assert calc_consistency(num_sent, num_received, True) is consistent + assert calc_consistency(num_received, num_sent, False) is consistent + + num_received = 10 + for num_sent in [8, 9, 10, 11]: + consistent = num_sent in [9, 10] + assert calc_consistency(num_sent, num_received, True) is consistent + assert calc_consistency(num_received, num_sent, False) is consistent + + +def test_calc_consistency_list() -> None: + num_sent = [3, 3] + for num_received in [[2, 3], [3, 2], [3, 5], [], [4, 4, 0, 0, 2]]: + assert not calc_consistency_list(num_sent, num_received, True) + assert not calc_consistency_list(num_received, num_sent, False) + for num_received in [[3, 3], [3, 4], [4, 3], [4, 4], + [3, 3, 1], [4, 4, 0, 0, 0, 1, 0, 1]]: + assert calc_consistency_list(num_sent, num_received, True) + assert calc_consistency_list(num_received, num_sent, False) + + +def test_stateful_peers(uq: Configuration, micro_is_stateless: bool) -> None: + snapshot_registry = SnapshotRegistry(uq) + macro = Reference('macro') + micro = Reference('micro') + qmc = Reference('qmc') + rr = Reference('rr') + + expected_stateful = {qmc, rr} | {macro + i for i in range(5)} + if not micro_is_stateless: + expected_stateful.update(micro + i for i in range(5)) + assert snapshot_registry._stateful_instances == expected_stateful + + assert snapshot_registry._get_stateful_peers(qmc) == {rr} + expected_rr_peers = {qmc} | {macro + i for i in range(5)} + assert snapshot_registry._get_stateful_peers(rr) == expected_rr_peers + for i in range(5): + expected_peers = {rr} if micro_is_stateless else {rr, micro + i} + assert snapshot_registry._get_stateful_peers(macro + i) == expected_peers + assert snapshot_registry._get_stateful_peers(micro + i) == {macro + i} + + +def test_connections(uq: Configuration) -> None: + snapshot_registry = SnapshotRegistry(uq) + macro = Reference('macro') + micro = Reference('micro') + qmc = Reference('qmc') + rr = Reference('rr') + + assert not snapshot_registry._get_connections(qmc, macro + 1) + assert not snapshot_registry._get_connections(macro + 3, qmc) + assert not snapshot_registry._get_connections(qmc, micro + 0) + assert not snapshot_registry._get_connections(micro + 1, qmc) + assert not snapshot_registry._get_connections(rr, micro + 4) + assert not snapshot_registry._get_connections(micro + 0, rr) + + connections = snapshot_registry._get_connections(rr, qmc) + assert len(connections) == 2 + for rr_port, qmc_port, info in connections: + assert rr_port in (Reference('front_out'), Reference('front_in')) + assert qmc_port in (Reference('parameters_out'), Reference('states_in')) + is_sending = bool(info & _ConnectionInfo.SELF_IS_SENDING) + assert is_sending is (rr_port == Reference('front_out')) + # Note: actually both are vector ports, but this is undetectable from + # the ymmsl configuration. Luckily we treat it the same as scalar-scalar + assert not (info & _ConnectionInfo.SELF_IS_VECTOR) + assert not (info & _ConnectionInfo.PEER_IS_VECTOR) + + connections = snapshot_registry._get_connections(macro + 0, rr) + assert len(connections) == 2 + for macro_port, rr_port, info in connections: + assert macro_port in ( + Reference('muscle_settings_in'), Reference('final_state_out')) + assert rr_port in (Reference('back_out'), Reference('back_in')) + is_sending = bool(info & _ConnectionInfo.SELF_IS_SENDING) + assert is_sending is (macro_port == Reference('final_state_out')) + assert not (info & _ConnectionInfo.SELF_IS_VECTOR) + assert (info & _ConnectionInfo.PEER_IS_VECTOR) + + connections = snapshot_registry._get_connections(rr, macro + 1) + assert len(connections) == 2 + for rr_port, macro_port, info in connections: + assert macro_port in ( + Reference('muscle_settings_in'), Reference('final_state_out')) + assert rr_port in (Reference('back_out'), Reference('back_in')) + is_sending = bool(info & _ConnectionInfo.SELF_IS_SENDING) + assert is_sending is (rr_port == Reference('back_out')) + assert (info & _ConnectionInfo.SELF_IS_VECTOR) + assert not (info & _ConnectionInfo.PEER_IS_VECTOR) + + +def test_macro_micro_snapshots( + macro_micro: Configuration, micro_is_stateless: bool) -> None: + snapshot_registry = SnapshotRegistry(macro_micro) + # prevent actually writing a ymmsl file, testing that separately + snapshot_registry._write_snapshot_ymmsl = MagicMock() + macro = Reference('macro') + micro = Reference('micro') + + macro_snapshot = make_snapshot(o_i=[3], s=[3]) + snapshot_registry.register_snapshot(macro, macro_snapshot) + + assert len(snapshot_registry._snapshots[macro]) == 1 + node = snapshot_registry._snapshots[macro][0] + assert node.consistent is micro_is_stateless + assert node.consistent_peers == {} + assert node.instance == macro + assert node.num == 1 + assert node.snapshot is macro_snapshot + if micro_is_stateless: + assert node.stateful_peers == set() + snapshot_registry._write_snapshot_ymmsl.assert_called_once_with([node]) + snapshot_registry._write_snapshot_ymmsl.reset_mock() + else: + assert node.stateful_peers == {micro} + snapshot_registry._write_snapshot_ymmsl.assert_not_called() + + if not micro_is_stateless: + # Note: this snapshot is not realistic, it should have come in before + # the macro snapshot above. However, it's still useful for testing the + # consistency algorithm + micro_snapshot = make_snapshot(f_i=[2], o_f=[1]) + snapshot_registry.register_snapshot(micro, micro_snapshot) + + assert len(snapshot_registry._snapshots[micro]) == 1 + assert not snapshot_registry._snapshots[micro][0].consistent + snapshot_registry._write_snapshot_ymmsl.assert_not_called() + + micro_snapshot = make_snapshot(f_i=[3], o_f=[2]) + snapshot_registry.register_snapshot(micro, micro_snapshot) + + # micro snapshots should be cleaned up now! + assert len(snapshot_registry._snapshots[micro]) == 1 + micro_node = snapshot_registry._snapshots[micro][0] + assert micro_node.consistent + snapshot_registry._write_snapshot_ymmsl.assert_called_with( + [micro_node, node]) + snapshot_registry._write_snapshot_ymmsl.reset_mock() + + micro_snapshot = make_snapshot(f_i=[4], o_f=[3]) + snapshot_registry.register_snapshot(micro, micro_snapshot) + + # micro snapshots should be cleaned up now! + assert len(snapshot_registry._snapshots[micro]) == 1 + micro_node = snapshot_registry._snapshots[micro][0] + assert micro_node.consistent + snapshot_registry._write_snapshot_ymmsl.assert_called_with( + [micro_node, node]) + snapshot_registry._write_snapshot_ymmsl.reset_mock() + + macro_snapshot = make_snapshot(o_i=[4], s=[4]) + snapshot_registry.register_snapshot(macro, macro_snapshot) + snapshot_registry._write_snapshot_ymmsl.assert_called_once() + + +def test_uq(uq: Configuration, micro_is_stateless: bool) -> None: + snapshot_registry = SnapshotRegistry(uq) + # prevent actually writing a ymmsl file, testing that separately + snapshot_registry._write_snapshot_ymmsl = MagicMock() + macro = Reference('macro') + micro = Reference('micro') + qmc = Reference('qmc') + rr = Reference('rr') + + qmc_snapshot = make_snapshot(parameters_out=[], states_in=[]) + snapshot_registry.register_snapshot(qmc, qmc_snapshot) + + rr_snapshot = make_snapshot( + front_in=[1, 1, 1, 1, 1, 0, 0, 0, 0, 0], + front_out=[0] * 10, + back_out=[1, 1, 1, 1, 1], + back_in=[0] * 5) + snapshot_registry.register_snapshot(rr, rr_snapshot) + node = snapshot_registry._snapshots[rr][-1] + assert qmc in node.consistent_peers + snapshot_registry._write_snapshot_ymmsl.assert_not_called() + + macro_snapshot = make_snapshot( + muscle_settings_in=[1], final_state_out=[0], o_i=[0], s=[0]) + for i in range(5): + snapshot_registry.register_snapshot(macro + i, macro_snapshot) + node = snapshot_registry._snapshots[macro + i][-1] + assert node.consistent_peers.keys() == {rr} + if micro_is_stateless and i == 4: + snapshot_registry._write_snapshot_ymmsl.assert_called_once() + snapshot_registry._write_snapshot_ymmsl.reset_mock() + else: + snapshot_registry._write_snapshot_ymmsl.assert_not_called() + + if not micro_is_stateless: + micro_snapshot = make_snapshot(f_i=[1], o_f=[0]) + for i in range(5): + snapshot_registry.register_snapshot(micro + i, micro_snapshot) + node = snapshot_registry._snapshots[micro + i][-1] + assert node.consistent_peers.keys() == {macro + i} + if i == 4: + snapshot_registry._write_snapshot_ymmsl.assert_called_once() + snapshot_registry._write_snapshot_ymmsl.reset_mock() + else: + snapshot_registry._write_snapshot_ymmsl.assert_not_called() + + qmc_snapshot = make_snapshot(parameters_out=[1, 1, 1, 1, 1], states_in=[]) + snapshot_registry.register_snapshot(qmc, qmc_snapshot) + node = snapshot_registry._snapshots[qmc][-1] + assert node.consistent_peers.keys() == {rr} + snapshot_registry._write_snapshot_ymmsl.assert_called_once() + snapshot_registry._write_snapshot_ymmsl.reset_mock() + assert len(snapshot_registry._snapshots[qmc]) == 1 # previous is cleaned up + + +def test_heuristic_rollbacks() -> None: + components = [Component(f'comp{i}', f'impl{i}') for i in range(4)] + conduits = [Conduit(f'comp{i}.o_f', f'comp{i+1}.f_i') for i in range(3)] + model = Model('linear', components, conduits) + implementations = [ + Implementation(f'impl{i}', supports_checkpoint=True, script='xyz') + for i in range(4)] + config = Configuration(model, implementations=implementations) + + comp1, comp2, comp3, comp4 = (Reference(f'comp{i}') for i in range(4)) + + snapshot_registry = SnapshotRegistry(config) + # prevent actually writing a ymmsl file, testing that separately + snapshot_registry._write_snapshot_ymmsl = MagicMock() + + for i in range(4): + snapshot_registry.register_snapshot(comp1, make_snapshot(o_f=[i])) + assert len(snapshot_registry._snapshots[comp1]) == 4 + + for i in range(10): + snapshot_registry.register_snapshot( + comp2, make_snapshot(f_i=[1], o_f=[0])) + snapshot_registry.register_snapshot( + comp3, make_snapshot(f_i=[1], o_f=[0])) + assert len(snapshot_registry._snapshots[comp2]) == 10 + assert len(snapshot_registry._snapshots[comp3]) == 10 + + snapshot_registry.register_snapshot(comp2, make_snapshot(f_i=[2], o_f=[1])) + assert len(snapshot_registry._snapshots[comp2]) == 11 + snapshot_registry.register_snapshot(comp2, make_snapshot(f_i=[3], o_f=[2])) + assert len(snapshot_registry._snapshots[comp2]) == 12 + + snapshot_registry._write_snapshot_ymmsl.assert_not_called() + + snapshot_registry.register_snapshot( + comp4, make_snapshot(f_i=[1])) + snapshot_registry._write_snapshot_ymmsl.assert_called() + + assert len(snapshot_registry._snapshots[comp1]) == 2 + assert len(snapshot_registry._snapshots[comp2]) == 2 + assert len(snapshot_registry._snapshots[comp3]) == 1 + assert len(snapshot_registry._snapshots[comp4]) == 1 From 13b9cee722011cf3d7e026c909d646b40c1f97c7 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Thu, 8 Sep 2022 16:17:01 +0200 Subject: [PATCH 038/183] Additional snapshot registry tests --- .../manager/test/test_snapshot_registry.py | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py b/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py index 7485b0a8..d9d83068 100644 --- a/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py +++ b/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py @@ -167,6 +167,35 @@ def test_connections(uq: Configuration) -> None: assert not (info & _ConnectionInfo.PEER_IS_VECTOR) +def test_multiplicity(uq: Configuration) -> None: + snapshot_registry = SnapshotRegistry(uq) + assert snapshot_registry._multiplicity(Reference('qmc')) == [] + assert snapshot_registry._multiplicity(Reference('rr')) == [] + assert snapshot_registry._multiplicity(Reference('macro')) == [5] + assert snapshot_registry._multiplicity(Reference('micro')) == [5] + + +def test_implementation(uq: Configuration) -> None: + snapshot_registry = SnapshotRegistry(uq) + + qmc_impl = snapshot_registry._implementation(Reference('qmc')) + assert qmc_impl.name == 'qmc_impl' + + missing_impl = snapshot_registry._implementation(Reference('missing')) + assert missing_impl is None + + +def test_stateful(uq: Configuration, micro_is_stateless: bool) -> None: + uq.implementations['macro_impl'].stateful = IState.WEAKLY_STATEFUL + snapshot_registry = SnapshotRegistry(uq) + + assert snapshot_registry._is_stateful(Reference('macro')) + stateful = snapshot_registry._is_stateful(Reference('micro')) + assert stateful is not micro_is_stateless + + assert snapshot_registry._is_stateful(Reference('unknown')) + + def test_macro_micro_snapshots( macro_micro: Configuration, micro_is_stateless: bool) -> None: snapshot_registry = SnapshotRegistry(macro_micro) From 1b8c97ad451b7cd3aa0f5fa2c77ca5999346b32e Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Fri, 9 Sep 2022 15:41:41 +0200 Subject: [PATCH 039/183] Add caching decorators and more docstrings. --- .../libmuscle/manager/snapshot_registry.py | 69 +++++++++++++++++-- 1 file changed, 65 insertions(+), 4 deletions(-) diff --git a/libmuscle/python/libmuscle/manager/snapshot_registry.py b/libmuscle/python/libmuscle/manager/snapshot_registry.py index 1fd4b12d..49c5351b 100644 --- a/libmuscle/python/libmuscle/manager/snapshot_registry.py +++ b/libmuscle/python/libmuscle/manager/snapshot_registry.py @@ -1,5 +1,6 @@ from dataclasses import dataclass, field from enum import Flag, auto +from functools import lru_cache from itertools import chain, zip_longest from operator import attrgetter from typing import Dict, Optional, Set, List, Tuple, TypeVar @@ -330,6 +331,11 @@ def _write_snapshot_ymmsl( def _cleanup_snapshots( self, selected_snapshots: List[SnapshotNode]) -> None: + """Remove all snapshots that are older than the selected snapshots. + + Args: + selected_snapshots: All snapshot nodes of a workflow snapshot + """ # remove all snapshots older than the selected ones removed_snapshots = set() # type: Set[SnapshotNode] for snapshot in selected_snapshots: @@ -349,8 +355,20 @@ def _cleanup_snapshots( peer_snapshot.consistent_peers[snapshot.instance].remove( snapshot) - # TODO: add caching decorator or move into an instance variable + @lru_cache(maxsize=None) def _get_stateful_peers(self, instance: Reference) -> Set[Reference]: + """Return the set of stateful peers for the given instance. + + Note: instance is assumed to contain the full index, not just the kernel + name. + + Args: + instance: Instance to get stateful peers of. See + :meth:`_is_stateful`. + + Returns: + Set with all stateful peer instances (including their index). + """ peers = set() # type: Set[Reference] kernel = instance.without_trailing_ints() index = [int(instance[i]) for i in range(len(kernel), len(instance))] @@ -376,9 +394,30 @@ def _get_stateful_peers(self, instance: Reference) -> Set[Reference]: peers.add(peer_kernel + index[:-1]) return peers - # TODO: add caching decorator or move into an instance variable + @lru_cache(maxsize=None) def _get_connections(self, instance: Reference, peer: Reference ) -> List[_ConnectionType]: + """Get the list of connections between instance and peer. + + Args: + instance: Instance reference (including index) + peer: Peer reference (including index) + + Returns: + A list of tuples describing all conduits between instance and peer: + instance_port (Reference): the port of instance that is + connected to + peer_port (Reference): the port on the peer instance + info (_ConnectionInfo): flag describing the connection. The + instance is sending when + ``info & _ConnectionInfo.SELF_IS_SENDING`` and receiving + otherwise. When the instance port is a vector port and the + peer port is a non-vector port, the flag + ``_ConnectionInfo.SELF_IS_VECTOR`` is set. In the reverse + situation the flag ``_ConnectionInfo.PEER_IS_VECTOR`` is + set. When both ports are vector or non-vector, neither flag + is set. + """ instance_kernel = instance.without_trailing_ints() peer_kernel = peer.without_trailing_ints() @@ -412,15 +451,26 @@ def _get_connections(self, instance: Reference, peer: Reference conn_type)) return connected_ports - # TODO: add caching decorator or move into an instance variable + @lru_cache(maxsize=None) def _multiplicity(self, kernel: Reference) -> List[int]: + """Return the multiplicity of a kernel + """ for component in self._configuration.model.components: if component.name == kernel: return component.multiplicity raise KeyError(str(kernel)) - # TODO: add caching decorator or move into an instance variable + @lru_cache(maxsize=None) def _implementation(self, kernel: Reference) -> Optional[Implementation]: + """Return the implementation of a kernel. + + Args: + kernel: The kernel to get the implementation for. + + Returns: + Implementation for the kernel, or None if not provided in the + configuration. + """ implementation = None for component in self._configuration.model.components: if component.name == kernel: @@ -429,7 +479,18 @@ def _implementation(self, kernel: Reference) -> Optional[Implementation]: return self._configuration.implementations[implementation] return None + @lru_cache(maxsize=None) def _is_stateful(self, kernel: Reference) -> bool: + """Check if a kernel has a stateful implementation. + + A kernel is considered stateful if: + - There is no Implementation given for the kernel + - Implementation.stateful = ImplementationState.STATEFUL + - Implementation.stateful = ImplementationState.WEAKLY_STATEFUL and the + implementation supports checkpointing. In this case we assume to get + snapshots from these kernels and we take them into account in the + snapshot graph. + """ implementation = self._implementation(kernel) if implementation is None: return True # assume stateful From e244bdf340d2f10d122877669670d733e84d4934 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Fri, 9 Sep 2022 15:42:44 +0200 Subject: [PATCH 040/183] Add logic for storing snapshot ymmsl --- .../libmuscle/manager/snapshot_registry.py | 78 +++++++++++++++++-- .../manager/test/test_snapshot_registry.py | 76 +++++++++++++++--- 2 files changed, 140 insertions(+), 14 deletions(-) diff --git a/libmuscle/python/libmuscle/manager/snapshot_registry.py b/libmuscle/python/libmuscle/manager/snapshot_registry.py index 49c5351b..9fb53d69 100644 --- a/libmuscle/python/libmuscle/manager/snapshot_registry.py +++ b/libmuscle/python/libmuscle/manager/snapshot_registry.py @@ -1,16 +1,21 @@ from dataclasses import dataclass, field +from datetime import datetime from enum import Flag, auto from functools import lru_cache from itertools import chain, zip_longest from operator import attrgetter +from pathlib import Path from typing import Dict, Optional, Set, List, Tuple, TypeVar -from ymmsl import Reference, Configuration, Identifier, Implementation -from ymmsl import ImplementationState as IState +from ymmsl import ( + Reference, Configuration, Identifier, Implementation, save, + PartialConfiguration, ImplementationState as IState) from libmuscle.snapshot import SnapshotMetadata +_MAX_FILE_EXISTS_CHECK = 100 + _SnapshotDictType = Dict[Reference, List["SnapshotNode"]] _ConnectionType = Tuple[Identifier, Identifier, "_ConnectionInfo"] _T = TypeVar("_T") @@ -162,13 +167,15 @@ class SnapshotRegistry: :meth:`register_snapshot`. """ - def __init__(self, configuration: Configuration) -> None: + def __init__( + self, configuration: Configuration, snapshot_folder: Path) -> None: """Create a snapshot graph using provided configuration. Args: configuration: ymmsl configuration describing the workflow. """ self._configuration = configuration + self._snapshot_folder = snapshot_folder self._snapshots = {} # type: _SnapshotDictType @@ -326,8 +333,69 @@ def num_allowed_snapshots(instance: Reference) -> int: return selected_snapshots def _write_snapshot_ymmsl( - self, selected_snapshot: List[SnapshotNode]) -> None: - ... + self, selected_snapshots: List[SnapshotNode]) -> None: + """Write the snapshot ymmsl file to the snapshot folder. + + Args: + selected_snapshots: All snapshot nodes of the workflow snapshot. + """ + now = datetime.now() + config = self._generate_snapshot_config(selected_snapshots, now) + time = now.strftime('%Y%m%d_%H%M%S') + for i in range(_MAX_FILE_EXISTS_CHECK): + if i: + snapshot_filename = f'snapshot_{time}_{i}.ymmsl' + else: + snapshot_filename = f'snapshot_{time}.ymmsl' + savepath = self._snapshot_folder / snapshot_filename + if not savepath.exists(): + save(config, savepath) + return + raise RuntimeError('Could not find an available filename for storing' + f' the next workflow snapshot: {savepath} already' + ' exists.') + + def _generate_snapshot_config( + self, selected_snapshots: List[SnapshotNode], now: datetime + ) -> PartialConfiguration: + """Generate ymmsl configuration for snapshot file + """ + selected_snapshots.sort(key=attrgetter('instance')) + resume = {} + for node in selected_snapshots: + resume[node.instance] = Path(node.snapshot.snapshot_filename) + description = self._generate_description(selected_snapshots, now) + return PartialConfiguration(resume=resume, description=description) + + def _generate_description( + self, selected_snapshots: List[SnapshotNode], now: datetime) -> str: + """Generate a human-readable description of the workflow snapshot. + """ + triggers = {} # type: Dict[str, List[str]] + component_info = [] + max_instance_len = len('Instance ') + for node in selected_snapshots: + for trigger in node.snapshot.triggers: + triggers.setdefault(trigger, []).append(str(node.instance)) + component_info.append(( + str(node.instance), + f'{node.snapshot.timestamp:<11.6g}', + f'{node.snapshot.wallclock_time:<11.6g}')) + max_instance_len = max(max_instance_len, len(str(node.instance))) + instance_with_padding = 'Instance'.ljust(max_instance_len) + component_table = [ + f'{instance_with_padding} t wallclock time', + f'{"-" * (max_instance_len + 27)}'] + component_table += [ + f'{name.ljust(max_instance_len)} {timestamp} {walltime}' + for name, timestamp, walltime in component_info] + return (f'Workflow snapshot for {self._configuration.model.name}' + f' taken on {now.strftime("%Y-%m-%d %H:%M:%S")}.\n' + 'Snapshot triggers:\n' + + '\n'.join(f'- {trigger} ({", ".join(triggers[trigger])})' + for trigger in sorted(triggers)) + + '\n\n' + + '\n'.join(component_table)) def _cleanup_snapshots( self, selected_snapshots: List[SnapshotNode]) -> None: diff --git a/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py b/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py index d9d83068..a7f48656 100644 --- a/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py +++ b/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py @@ -1,3 +1,5 @@ +from datetime import datetime, timedelta +from pathlib import Path from unittest.mock import MagicMock import pytest @@ -7,7 +9,7 @@ ImplementationState as IState, Reference) from libmuscle.manager.snapshot_registry import ( - SnapshotRegistry, calc_consistency, calc_consistency_list, safe_get, + SnapshotNode, SnapshotRegistry, calc_consistency, calc_consistency_list, safe_get, _ConnectionInfo) @@ -97,8 +99,64 @@ def test_calc_consistency_list() -> None: assert calc_consistency_list(num_received, num_sent, False) +def test_write_ymmsl(tmp_path: Path): + snapshot_registry = SnapshotRegistry( + Configuration(Model('empty', [])), tmp_path) + snapshot_registry._write_snapshot_ymmsl([]) + + paths = list(tmp_path.iterdir()) + assert len(paths) == 1 + assert paths[0].suffix == ".ymmsl" + paths[0].unlink() + + now = datetime.now() + for seconds in range(3): + time = (now + timedelta(seconds=seconds)).strftime("%Y%m%d_%H%M%S") + (tmp_path / f'snapshot_{time}.ymmsl').touch() + snapshot_registry._write_snapshot_ymmsl([]) + paths = list(tmp_path.iterdir()) + assert len(paths) == 4 + paths = list(tmp_path.glob('*_1.ymmsl')) + assert len(paths) == 1 + + +def test_snapshot_config(): + snapshot_registry = SnapshotRegistry( + Configuration(Model('empty', [])), None) + micro_metadata = SnapshotMetadata( + ['simulation_time >= 24.0', 'wallclocktime >= 10'], + 10.123456789, 24.3456789, None, {}, False, 'micro_snapshot') + macro_metadata = SnapshotMetadata( + ['simulation_time >= 12.0', 'wallclocktime >= 10'], + 10.123456789, 12.3456789, None, {}, False, 'macro_snapshot') + snapshots = [ + SnapshotNode(1, Reference('micro'), micro_metadata, set()), + SnapshotNode(1, Reference('macro'), macro_metadata, set())] + + now = datetime.now() + config = snapshot_registry._generate_snapshot_config(snapshots, now) + assert len(config.resume) == 2 + assert config.resume[Reference('macro')] == Path('macro_snapshot') + assert config.resume[Reference('micro')] == Path('micro_snapshot') + # note: no automatic testing for formatting, should verify by eye if this + # looks okay.. + print(config.description) + + long_metadata = SnapshotMetadata( + ['simulation_time >= 24.0'], 1.23456789e-10, 1.23456789e10, None, + {}, False, '/this/is/a/long/path/to/the/snapshot/file.pack') + snapshots.append(SnapshotNode( + 1, Reference('this.is.a.long.reference[10]'), long_metadata, set())) + + config = snapshot_registry._generate_snapshot_config(snapshots, now) + assert len(config.resume) == 3 + assert config.resume[Reference('this.is.a.long.reference[10]')] == Path( + '/this/is/a/long/path/to/the/snapshot/file.pack') + print(config.description) + + def test_stateful_peers(uq: Configuration, micro_is_stateless: bool) -> None: - snapshot_registry = SnapshotRegistry(uq) + snapshot_registry = SnapshotRegistry(uq, None) macro = Reference('macro') micro = Reference('micro') qmc = Reference('qmc') @@ -119,7 +177,7 @@ def test_stateful_peers(uq: Configuration, micro_is_stateless: bool) -> None: def test_connections(uq: Configuration) -> None: - snapshot_registry = SnapshotRegistry(uq) + snapshot_registry = SnapshotRegistry(uq, None) macro = Reference('macro') micro = Reference('micro') qmc = Reference('qmc') @@ -168,7 +226,7 @@ def test_connections(uq: Configuration) -> None: def test_multiplicity(uq: Configuration) -> None: - snapshot_registry = SnapshotRegistry(uq) + snapshot_registry = SnapshotRegistry(uq, None) assert snapshot_registry._multiplicity(Reference('qmc')) == [] assert snapshot_registry._multiplicity(Reference('rr')) == [] assert snapshot_registry._multiplicity(Reference('macro')) == [5] @@ -176,7 +234,7 @@ def test_multiplicity(uq: Configuration) -> None: def test_implementation(uq: Configuration) -> None: - snapshot_registry = SnapshotRegistry(uq) + snapshot_registry = SnapshotRegistry(uq, None) qmc_impl = snapshot_registry._implementation(Reference('qmc')) assert qmc_impl.name == 'qmc_impl' @@ -187,7 +245,7 @@ def test_implementation(uq: Configuration) -> None: def test_stateful(uq: Configuration, micro_is_stateless: bool) -> None: uq.implementations['macro_impl'].stateful = IState.WEAKLY_STATEFUL - snapshot_registry = SnapshotRegistry(uq) + snapshot_registry = SnapshotRegistry(uq, None) assert snapshot_registry._is_stateful(Reference('macro')) stateful = snapshot_registry._is_stateful(Reference('micro')) @@ -198,7 +256,7 @@ def test_stateful(uq: Configuration, micro_is_stateless: bool) -> None: def test_macro_micro_snapshots( macro_micro: Configuration, micro_is_stateless: bool) -> None: - snapshot_registry = SnapshotRegistry(macro_micro) + snapshot_registry = SnapshotRegistry(macro_micro, None) # prevent actually writing a ymmsl file, testing that separately snapshot_registry._write_snapshot_ymmsl = MagicMock() macro = Reference('macro') @@ -261,7 +319,7 @@ def test_macro_micro_snapshots( def test_uq(uq: Configuration, micro_is_stateless: bool) -> None: - snapshot_registry = SnapshotRegistry(uq) + snapshot_registry = SnapshotRegistry(uq, None) # prevent actually writing a ymmsl file, testing that separately snapshot_registry._write_snapshot_ymmsl = MagicMock() macro = Reference('macro') @@ -326,7 +384,7 @@ def test_heuristic_rollbacks() -> None: comp1, comp2, comp3, comp4 = (Reference(f'comp{i}') for i in range(4)) - snapshot_registry = SnapshotRegistry(config) + snapshot_registry = SnapshotRegistry(config, None) # prevent actually writing a ymmsl file, testing that separately snapshot_registry._write_snapshot_ymmsl = MagicMock() From adfe4760885b3222474791f2193f0db0d5454993 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Tue, 13 Sep 2022 11:48:53 +0200 Subject: [PATCH 041/183] Implement submit_snapshot in MMP server --- libmuscle/python/libmuscle/manager/manager.py | 13 +++- .../python/libmuscle/manager/mmp_server.py | 28 ++++++++- libmuscle/python/libmuscle/manager/run_dir.py | 18 ++++++ .../libmuscle/manager/snapshot_registry.py | 26 +++++--- .../python/libmuscle/manager/test/conftest.py | 62 ++++++++++++------- .../manager/test/test_mmp_request_handler.py | 30 ++++++++- libmuscle/python/libmuscle/mcp/protocol.py | 1 + 7 files changed, 139 insertions(+), 39 deletions(-) diff --git a/libmuscle/python/libmuscle/manager/manager.py b/libmuscle/python/libmuscle/manager/manager.py index 21f21c60..14c7e15b 100644 --- a/libmuscle/python/libmuscle/manager/manager.py +++ b/libmuscle/python/libmuscle/manager/manager.py @@ -11,6 +11,7 @@ from libmuscle.manager.mmp_server import MMPServer from libmuscle.manager.instance_manager import InstanceManager from libmuscle.manager.run_dir import RunDir +from libmuscle.manager.snapshot_registry import SnapshotRegistry from libmuscle.manager.topology_store import TopologyStore @@ -42,6 +43,15 @@ def __init__( self._logger = Logger(log_dir, log_level) self._topology_store = TopologyStore(configuration) self._instance_registry = InstanceRegistry() + if run_dir is not None: + snapshot_dir = run_dir.snapshot_dir() + else: + snapshot_dir = Path.cwd() + if self._configuration.checkpoints: + _logger.warning('Checkpoints are configured but no run' + ' directory is provided. Snapshots will be' + ' stored in the current working directory.') + self._snapshot_registry = SnapshotRegistry(configuration, snapshot_dir) if self._run_dir: save_ymmsl( @@ -59,7 +69,8 @@ def __init__( self._server = MMPServer( self._logger, self._configuration, - self._instance_registry, self._topology_store) + self._instance_registry, self._topology_store, + self._snapshot_registry) if self._instance_manager: self._instance_manager.set_manager_location( diff --git a/libmuscle/python/libmuscle/manager/mmp_server.py b/libmuscle/python/libmuscle/manager/mmp_server.py index 793bbfc8..6f377362 100644 --- a/libmuscle/python/libmuscle/manager/mmp_server.py +++ b/libmuscle/python/libmuscle/manager/mmp_server.py @@ -12,10 +12,12 @@ from libmuscle.manager.instance_registry import ( AlreadyRegistered, InstanceRegistry) from libmuscle.manager.logger import Logger +from libmuscle.manager.snapshot_registry import SnapshotRegistry from libmuscle.manager.topology_store import TopologyStore from libmuscle.mcp.protocol import RequestType, ResponseType from libmuscle.mcp.tcp_transport_server import TcpTransportServer from libmuscle.mcp.transport_server import RequestHandler +from libmuscle.snapshot import SnapshotMetadata from libmuscle.timestamp import Timestamp from libmuscle.util import generate_indices, instance_indices @@ -55,7 +57,8 @@ def __init__( logger: Logger, configuration: PartialConfiguration, instance_registry: InstanceRegistry, - topology_store: TopologyStore): + topology_store: TopologyStore, + snapshot_registry: SnapshotRegistry): """Create an MMPRequestHandler. Args: @@ -68,6 +71,7 @@ def __init__( self._configuration = configuration self._instance_registry = instance_registry self._topology_store = topology_store + self._snapshot_registry = snapshot_registry self._reference_time = datetime.now(timezone.utc) self._reference_timestamp = self._reference_time.timestamp() @@ -95,6 +99,8 @@ def handle_request(self, request: bytes) -> bytes: response = self._submit_log_message(*req_args) elif req_type == RequestType.SUBMIT_PROFILE_EVENTS.value: response = self._submit_profile_events(*req_args) + elif req_type == RequestType.SUBMIT_SNAPSHOT.value: + response = self._submit_snapshot(*req_args) return cast(bytes, msgpack.packb(response, use_bin_type=True)) @@ -259,6 +265,20 @@ def _submit_profile_events(self, events: List[List[Any]]) -> Any: """ return [ResponseType.SUCCESS.value] + def _submit_snapshot( + self, instance_id: str, snapshot: Dict[str, Any]) -> Any: + """Handle a submit snapshot request. + + Returns: + A list containing the following values on success: + + status (ResponseType): SUCCESS + """ + snapshot_obj = SnapshotMetadata(**snapshot) + instance = Reference(instance_id) + self._snapshot_registry.register_snapshot(instance, snapshot_obj) + return [ResponseType.SUCCESS.value] + def _generate_peer_instances( self, instance: Reference) -> Generator[Reference, None, None]: """Generates the names of all peer instances of an instance. @@ -320,7 +340,8 @@ def __init__( logger: Logger, configuration: PartialConfiguration, instance_registry: InstanceRegistry, - topology_store: TopologyStore + topology_store: TopologyStore, + snapshot_registry: SnapshotRegistry ) -> None: """Create an MMPServer. @@ -338,7 +359,8 @@ def __init__( topology_store: To get peers and conduits from """ self._handler = MMPRequestHandler( - logger, configuration, instance_registry, topology_store) + logger, configuration, instance_registry, topology_store, + snapshot_registry) try: self._server = TcpTransportServer(self._handler, 9000) except OSError as e: diff --git a/libmuscle/python/libmuscle/manager/run_dir.py b/libmuscle/python/libmuscle/manager/run_dir.py index 8bb4b91e..c2a50ed9 100644 --- a/libmuscle/python/libmuscle/manager/run_dir.py +++ b/libmuscle/python/libmuscle/manager/run_dir.py @@ -1,4 +1,5 @@ from pathlib import Path +from typing import Optional from ymmsl import Reference @@ -20,6 +21,8 @@ class RunDir: .out .err work_dir/ + snapshots/ + snapshots/ """ def __init__(self, run_dir: Path) -> None: """Create a RunDir managing the given directory. @@ -57,3 +60,18 @@ def instance_dir(self, name: Reference) -> Path: make it. """ return self.path / 'instances' / str(name) + + def snapshot_dir(self, name: Optional[Reference] = None) -> Path: + """Return the snapshot directory for the workflow or for an instance. + + Args: + name: Name of the instance. May be None to get the workflow snapshot + directory. + + Returns: + The path to the snapshot directory + """ + if name is None: + return self.path / 'snapshots' + else: + return self.instance_dir(name) / 'snapshots' diff --git a/libmuscle/python/libmuscle/manager/snapshot_registry.py b/libmuscle/python/libmuscle/manager/snapshot_registry.py index 9fb53d69..b7f34253 100644 --- a/libmuscle/python/libmuscle/manager/snapshot_registry.py +++ b/libmuscle/python/libmuscle/manager/snapshot_registry.py @@ -8,7 +8,7 @@ from typing import Dict, Optional, Set, List, Tuple, TypeVar from ymmsl import ( - Reference, Configuration, Identifier, Implementation, save, + Reference, Model, Identifier, Implementation, save, PartialConfiguration, ImplementationState as IState) from libmuscle.snapshot import SnapshotMetadata @@ -168,20 +168,26 @@ class SnapshotRegistry: """ def __init__( - self, configuration: Configuration, snapshot_folder: Path) -> None: + self, config: PartialConfiguration, snapshot_folder: Path + ) -> None: """Create a snapshot graph using provided configuration. Args: - configuration: ymmsl configuration describing the workflow. + config: ymmsl configuration describing the workflow. """ - self._configuration = configuration + if config.model is None or not isinstance(config.model, Model): + raise ValueError('The yMMSL experiment description does not' + ' contain a (complete) model section, so there' + ' is nothing to run!') + self._configuration = config + self._model = config.model self._snapshot_folder = snapshot_folder self._snapshots = {} # type: _SnapshotDictType self._instances = set() # type: Set[Reference] self._stateful_instances = set() # type: Set[Reference] - for component in configuration.model.components: + for component in config.model.components: instances = set(component.instances()) self._instances.update(instances) if self._is_stateful(component.name): @@ -389,7 +395,7 @@ def _generate_description( component_table += [ f'{name.ljust(max_instance_len)} {timestamp} {walltime}' for name, timestamp, walltime in component_info] - return (f'Workflow snapshot for {self._configuration.model.name}' + return (f'Workflow snapshot for {self._model.name}' f' taken on {now.strftime("%Y-%m-%d %H:%M:%S")}.\n' 'Snapshot triggers:\n' + '\n'.join(f'- {trigger} ({", ".join(triggers[trigger])})' @@ -440,7 +446,7 @@ def _get_stateful_peers(self, instance: Reference) -> Set[Reference]: peers = set() # type: Set[Reference] kernel = instance.without_trailing_ints() index = [int(instance[i]) for i in range(len(kernel), len(instance))] - for conduit in self._configuration.model.conduits: + for conduit in self._model.conduits: if conduit.sending_component() == kernel: peer_kernel = conduit.receiving_component() elif conduit.receiving_component() == kernel: @@ -490,7 +496,7 @@ def _get_connections(self, instance: Reference, peer: Reference peer_kernel = peer.without_trailing_ints() connected_ports = [] # type: List[_ConnectionType] - for conduit in self._configuration.model.conduits: + for conduit in self._model.conduits: if (conduit.sending_component() == instance_kernel and conduit.receiving_component() == peer_kernel): conn_type = _ConnectionInfo.SELF_IS_SENDING @@ -523,7 +529,7 @@ def _get_connections(self, instance: Reference, peer: Reference def _multiplicity(self, kernel: Reference) -> List[int]: """Return the multiplicity of a kernel """ - for component in self._configuration.model.components: + for component in self._model.components: if component.name == kernel: return component.multiplicity raise KeyError(str(kernel)) @@ -540,7 +546,7 @@ def _implementation(self, kernel: Reference) -> Optional[Implementation]: configuration. """ implementation = None - for component in self._configuration.model.components: + for component in self._model.components: if component.name == kernel: implementation = component.implementation if implementation in self._configuration.implementations: diff --git a/libmuscle/python/libmuscle/manager/test/conftest.py b/libmuscle/python/libmuscle/manager/test/conftest.py index 433e23b1..e95f290c 100644 --- a/libmuscle/python/libmuscle/manager/test/conftest.py +++ b/libmuscle/python/libmuscle/manager/test/conftest.py @@ -1,12 +1,12 @@ from pathlib import Path import pytest -from ymmsl import (Component, Conduit, Configuration, Model, Reference, - PartialConfiguration) +from ymmsl import Component, Conduit, Configuration, Model, Reference from libmuscle.manager.instance_registry import InstanceRegistry from libmuscle.manager.logger import Logger from libmuscle.manager.mmp_server import MMPRequestHandler +from libmuscle.manager.snapshot_registry import SnapshotRegistry from libmuscle.manager.topology_store import TopologyStore @@ -19,17 +19,7 @@ def logger(tmpdir): @pytest.fixture def mmp_configuration(): - return PartialConfiguration() - - -@pytest.fixture -def instance_registry(): - return InstanceRegistry() - - -@pytest.fixture -def topology_store() -> TopologyStore: - config = Configuration( + return Configuration( Model( 'test_model', [ @@ -41,14 +31,29 @@ def topology_store() -> TopologyStore: Conduit('micro.out', 'macro.in') ])) - return TopologyStore(config) + +@pytest.fixture +def instance_registry(): + return InstanceRegistry() + + +@pytest.fixture +def topology_store(mmp_configuration) -> TopologyStore: + return TopologyStore(mmp_configuration) + + +@pytest.fixture +def snapshot_registry(mmp_configuration) -> SnapshotRegistry: + return SnapshotRegistry(mmp_configuration, None) @pytest.fixture def mmp_request_handler( - logger, mmp_configuration, instance_registry, topology_store): + logger, mmp_configuration, instance_registry, topology_store, + snapshot_registry): return MMPRequestHandler( - logger, mmp_configuration, instance_registry, topology_store) + logger, mmp_configuration, instance_registry, topology_store, + snapshot_registry) @pytest.fixture @@ -64,14 +69,16 @@ def loaded_instance_registry(instance_registry): @pytest.fixture def registered_mmp_request_handler( - logger, mmp_configuration, loaded_instance_registry, topology_store): + logger, mmp_configuration, loaded_instance_registry, topology_store, + snapshot_registry): return MMPRequestHandler( - logger, mmp_configuration, loaded_instance_registry, topology_store) + logger, mmp_configuration, loaded_instance_registry, topology_store, + snapshot_registry) @pytest.fixture -def topology_store2() -> TopologyStore: - config = Configuration( +def mmp_configuration2(): + return Configuration( Model( 'test_model', [ @@ -86,7 +93,15 @@ def topology_store2() -> TopologyStore: Conduit('meso.out', 'macro.in') ])) - return TopologyStore(config) + +@pytest.fixture +def topology_store2(mmp_configuration2) -> TopologyStore: + return TopologyStore(mmp_configuration2) + + +@pytest.fixture +def snapshot_registry2(mmp_configuration2) -> SnapshotRegistry: + return SnapshotRegistry(mmp_configuration2, None) @pytest.fixture @@ -110,7 +125,8 @@ def loaded_instance_registry2(): @pytest.fixture def registered_mmp_request_handler2( - logger, mmp_configuration, loaded_instance_registry2, topology_store2): + logger, mmp_configuration, loaded_instance_registry2, topology_store2, + snapshot_registry2): return MMPRequestHandler( logger, mmp_configuration, - loaded_instance_registry2, topology_store2) + loaded_instance_registry2, topology_store2, snapshot_registry2) diff --git a/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py b/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py index 0d91c650..ac80dca2 100644 --- a/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py +++ b/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py @@ -1,5 +1,8 @@ +import dataclasses from datetime import datetime, timezone from pathlib import Path +from unittest.mock import MagicMock + import msgpack from ymmsl import ( Operator, Reference, Checkpoints, CheckpointRangeRule, CheckpointAtRule) @@ -7,12 +10,14 @@ from libmuscle.logging import LogLevel from libmuscle.manager.mmp_server import MMPRequestHandler from libmuscle.mcp.protocol import RequestType, ResponseType +from libmuscle.snapshot import SnapshotMetadata def test_create_servicer(logger, mmp_configuration, instance_registry, - topology_store): + topology_store, snapshot_registry): MMPRequestHandler( - logger, mmp_configuration, instance_registry, topology_store) + logger, mmp_configuration, instance_registry, topology_store, + snapshot_registry) def test_log_message(mmp_request_handler, caplog): @@ -267,3 +272,24 @@ def test_request_peers_unknown(registered_mmp_request_handler2): assert status == ResponseType.ERROR.value assert error_msg is not None assert 'does_not_exist' in error_msg + + +def test_submit_snapshot(registered_mmp_request_handler): + register_snapshot = MagicMock() + registered_mmp_request_handler._snapshot_registry.register_snapshot = \ + register_snapshot + + instance_id = 'micro[1][2]' + snapshot = SnapshotMetadata( + ['1', '2'], 1.234, 2.345, 3.456, + {'in': [1], 'out': [0]}, True, 'fname') + snapshot_dict = dataclasses.asdict(snapshot) + + request = [RequestType.SUBMIT_SNAPSHOT.value, instance_id, snapshot_dict] + encoded_request = msgpack.packb(request, use_bin_type=True) + + result = registered_mmp_request_handler.handle_request(encoded_request) + decoded_result = msgpack.unpackb(result, raw=False) + + assert decoded_result[0] == ResponseType.SUCCESS.value + register_snapshot.assert_called_once_with(Reference(instance_id), snapshot) diff --git a/libmuscle/python/libmuscle/mcp/protocol.py b/libmuscle/python/libmuscle/mcp/protocol.py index 1e79a11d..06d1c0da 100644 --- a/libmuscle/python/libmuscle/mcp/protocol.py +++ b/libmuscle/python/libmuscle/mcp/protocol.py @@ -20,6 +20,7 @@ class RequestType(Enum): GET_SETTINGS = 4 SUBMIT_LOG_MESSAGE = 5 SUBMIT_PROFILE_EVENTS = 6 + SUBMIT_SNAPSHOT = 7 # MUSCLE Peer Protocol GET_NEXT_MESSAGE = 21 From d4c19ddabc14ce09c5f9dea3a372a14f14b84019 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Tue, 13 Sep 2022 11:57:03 +0200 Subject: [PATCH 042/183] Implement submit_snapshot in MMP client --- libmuscle/python/libmuscle/mmp_client.py | 18 +++++++++++++++--- libmuscle/python/libmuscle/snapshot_manager.py | 2 +- .../libmuscle/test/test_snapshot_manager.py | 6 ++++-- 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/libmuscle/python/libmuscle/mmp_client.py b/libmuscle/python/libmuscle/mmp_client.py index 6376aa20..6a3fe729 100644 --- a/libmuscle/python/libmuscle/mmp_client.py +++ b/libmuscle/python/libmuscle/mmp_client.py @@ -1,3 +1,4 @@ +import dataclasses from datetime import datetime, timezone from pathlib import Path from random import uniform @@ -135,9 +136,20 @@ def submit_profile_events(self, events: Iterable[ProfileEvent]) -> None: [encode_profile_event(e) for e in events]] self._call_manager(request) - def submit_snapshot_metadata(self, snapshot_metadata: SnapshotMetadata - ) -> None: - ... # TODO + def submit_snapshot_metadata( + self, name: Reference, snapshot_metadata: SnapshotMetadata + ) -> None: + """Send snapshot metadata to the manager. + + Args: + name: Name of the instance in the simulation. + snapshot_metadata: Snapshot metadata to supply to the manager. + """ + request = [ + RequestType.SUBMIT_SNAPSHOT.value, + str(name), + dataclasses.asdict(snapshot_metadata)] + self._call_manager(request) def get_settings(self) -> Settings: """Get the central settings from the manager. diff --git a/libmuscle/python/libmuscle/snapshot_manager.py b/libmuscle/python/libmuscle/snapshot_manager.py index cd6f9959..10f2c9fc 100644 --- a/libmuscle/python/libmuscle/snapshot_manager.py +++ b/libmuscle/python/libmuscle/snapshot_manager.py @@ -154,7 +154,7 @@ def __save_snapshot(self, msg: Message, final: bool) -> None: path = self.__store_snapshot(snapshot) metadata = SnapshotMetadata.from_snapshot(snapshot, str(path)) - self._manager.submit_snapshot_metadata(metadata) + self._manager.submit_snapshot_metadata(self._instance_id, metadata) if self._trigger is not None: self._trigger.update_checkpoints( diff --git a/libmuscle/python/libmuscle/test/test_snapshot_manager.py b/libmuscle/python/libmuscle/test/test_snapshot_manager.py index d7d386c9..f1c18ec8 100644 --- a/libmuscle/python/libmuscle/test/test_snapshot_manager.py +++ b/libmuscle/python/libmuscle/test/test_snapshot_manager.py @@ -56,7 +56,8 @@ def test_save_load_checkpoint(tmp_path: Path) -> None: communicator.get_message_counts.assert_called_with() manager.submit_snapshot_metadata.assert_called() - metadata = manager.submit_snapshot_metadata.call_args[0][0] + instance, metadata = manager.submit_snapshot_metadata.call_args[0] + assert instance == instance_id assert isinstance(metadata, SnapshotMetadata) assert metadata.triggers assert metadata.wallclock_time > 0.0 @@ -86,7 +87,8 @@ def test_save_load_checkpoint(tmp_path: Path) -> None: assert snapshot_manager2.should_save_final_snapshot(0.6) snapshot_manager2.save_final_snapshot(Message(0.6, None, 'test data2')) - metadata = manager.submit_snapshot_metadata.call_args[0][0] + instance, metadata = manager.submit_snapshot_metadata.call_args[0] + assert instance == instance_id assert isinstance(metadata, SnapshotMetadata) assert metadata.triggers assert metadata.wallclock_time > 0.0 From 4fd2b530a4698aba9d5d8552b4871b8cf8201bb4 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Tue, 13 Sep 2022 15:36:45 +0200 Subject: [PATCH 043/183] Use builtin itertools.product for generate_indices --- libmuscle/python/libmuscle/util.py | 29 +++-------------------------- 1 file changed, 3 insertions(+), 26 deletions(-) diff --git a/libmuscle/python/libmuscle/util.py b/libmuscle/python/libmuscle/util.py index 66217c21..f34d8cd9 100644 --- a/libmuscle/python/libmuscle/util.py +++ b/libmuscle/python/libmuscle/util.py @@ -1,3 +1,4 @@ +import itertools from pathlib import Path import sys from typing import Generator, List, Optional, cast @@ -47,32 +48,8 @@ def generate_indices(dims: List[int]) -> Generator[List[int], None, None]: Yields: Lists of indices, one for each point in the block. """ - index = [0] * len(dims) - done = False - while not done: - yield index - done = increment_index(index, dims) - - -def increment_index(index: List[int], dims: List[int]) -> bool: - """Increments an index. - - Args: - index: The index to be incremented. - dims: The dimensions of the block this index is in. - - Returns: - True iff the index overflowed and is now all zeros again. - """ - cur = len(index) - 1 - index[cur] += 1 - while index[cur] == dims[cur]: - index[cur] = 0 - if cur == 0: - return True - cur -= 1 - index[cur] += 1 - return False + for index in itertools.product(*map(range, dims)): + yield list(index) def extract_log_file_location(filename: str) -> Optional[Path]: From 401a27594d570ad63bbb86ae0c26d137841e014a Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Tue, 13 Sep 2022 15:38:36 +0200 Subject: [PATCH 044/183] Refactor generate_peer_instances Now reused across mmp_server and snapshot_registry --- libmuscle/python/libmuscle/manager/manager.py | 3 +- .../python/libmuscle/manager/mmp_server.py | 31 ++----------- .../libmuscle/manager/snapshot_registry.py | 43 ++++--------------- .../python/libmuscle/manager/test/conftest.py | 8 ++-- .../manager/test/test_snapshot_registry.py | 32 ++++++-------- .../libmuscle/manager/topology_store.py | 28 ++++++++++++ 6 files changed, 59 insertions(+), 86 deletions(-) diff --git a/libmuscle/python/libmuscle/manager/manager.py b/libmuscle/python/libmuscle/manager/manager.py index 14c7e15b..c28e65fb 100644 --- a/libmuscle/python/libmuscle/manager/manager.py +++ b/libmuscle/python/libmuscle/manager/manager.py @@ -51,7 +51,8 @@ def __init__( _logger.warning('Checkpoints are configured but no run' ' directory is provided. Snapshots will be' ' stored in the current working directory.') - self._snapshot_registry = SnapshotRegistry(configuration, snapshot_dir) + self._snapshot_registry = SnapshotRegistry( + configuration, snapshot_dir, self._topology_store) if self._run_dir: save_ymmsl( diff --git a/libmuscle/python/libmuscle/manager/mmp_server.py b/libmuscle/python/libmuscle/manager/mmp_server.py index 6f377362..f5b8b692 100644 --- a/libmuscle/python/libmuscle/manager/mmp_server.py +++ b/libmuscle/python/libmuscle/manager/mmp_server.py @@ -1,7 +1,7 @@ from datetime import datetime, timezone import errno import logging -from typing import Any, Dict, Optional, Tuple, cast, Generator, List +from typing import Any, Dict, Optional, Tuple, cast, List import msgpack from ymmsl import ( @@ -19,7 +19,6 @@ from libmuscle.mcp.transport_server import RequestHandler from libmuscle.snapshot import SnapshotMetadata from libmuscle.timestamp import Timestamp -from libmuscle.util import generate_indices, instance_indices _logger = logging.getLogger(__name__) @@ -182,9 +181,10 @@ def _get_peers(self, instance_id: str) -> Any: # generate instances try: + peers = self._topology_store.get_peer_instances(instance) instance_locations = { str(peer): self._instance_registry.get_locations(peer) - for peer in self._generate_peer_instances(instance)} + for peer in peers} except KeyError as e: return [ ResponseType.PENDING.value, @@ -279,31 +279,6 @@ def _submit_snapshot( self._snapshot_registry.register_snapshot(instance, snapshot_obj) return [ResponseType.SUCCESS.value] - def _generate_peer_instances( - self, instance: Reference) -> Generator[Reference, None, None]: - """Generates the names of all peer instances of an instance. - - Args: - instance: The instance whose peers to generate. - - Yields: - All peer instance identifiers. - """ - component = instance.without_trailing_ints() - indices = instance_indices(instance) - dims = self._topology_store.kernel_dimensions[component] - all_peer_dims = self._topology_store.get_peer_dimensions(component) - for peer, peer_dims in all_peer_dims.items(): - base = peer - for i in range(min(len(dims), len(peer_dims))): - base += indices[i] - - if dims >= peer_dims: - yield base - else: - for peer_indices in generate_indices(peer_dims[len(dims):]): - yield base + peer_indices - def _get_checkpoint_info( self, instance: Reference diff --git a/libmuscle/python/libmuscle/manager/snapshot_registry.py b/libmuscle/python/libmuscle/manager/snapshot_registry.py index b7f34253..6e7f1199 100644 --- a/libmuscle/python/libmuscle/manager/snapshot_registry.py +++ b/libmuscle/python/libmuscle/manager/snapshot_registry.py @@ -6,6 +6,7 @@ from operator import attrgetter from pathlib import Path from typing import Dict, Optional, Set, List, Tuple, TypeVar +from libmuscle.manager.topology_store import TopologyStore from ymmsl import ( Reference, Model, Identifier, Implementation, save, @@ -168,8 +169,8 @@ class SnapshotRegistry: """ def __init__( - self, config: PartialConfiguration, snapshot_folder: Path - ) -> None: + self, config: PartialConfiguration, snapshot_folder: Path, + topology_store: TopologyStore) -> None: """Create a snapshot graph using provided configuration. Args: @@ -182,6 +183,7 @@ def __init__( self._configuration = config self._model = config.model self._snapshot_folder = snapshot_folder + self._topology_store = topology_store self._snapshots = {} # type: _SnapshotDictType @@ -443,30 +445,10 @@ def _get_stateful_peers(self, instance: Reference) -> Set[Reference]: Returns: Set with all stateful peer instances (including their index). """ - peers = set() # type: Set[Reference] - kernel = instance.without_trailing_ints() - index = [int(instance[i]) for i in range(len(kernel), len(instance))] - for conduit in self._model.conduits: - if conduit.sending_component() == kernel: - peer_kernel = conduit.receiving_component() - elif conduit.receiving_component() == kernel: - peer_kernel = conduit.sending_component() - else: - continue - if not self._is_stateful(peer_kernel): - continue - if len(index) == len(self._multiplicity(peer_kernel)): - # we must be sending to the peer with the same index as us - peers.add(peer_kernel + index) - elif len(index) + 1 == len(self._multiplicity(peer_kernel)): - # we are sending on a vector port, peer is receiving non-vector - # generate all peer indices - for i in range(self._multiplicity(peer_kernel)[-1]): - peers.add(peer_kernel + index + i) - elif len(index) - 1 == len(self._multiplicity(peer_kernel)): - # we are sending to a vector port, strip last of our indices - peers.add(peer_kernel + index[:-1]) - return peers + return set( + peer + for peer in self._topology_store.get_peer_instances(instance) + if self._is_stateful(peer.without_trailing_ints())) @lru_cache(maxsize=None) def _get_connections(self, instance: Reference, peer: Reference @@ -525,15 +507,6 @@ def _get_connections(self, instance: Reference, peer: Reference conn_type)) return connected_ports - @lru_cache(maxsize=None) - def _multiplicity(self, kernel: Reference) -> List[int]: - """Return the multiplicity of a kernel - """ - for component in self._model.components: - if component.name == kernel: - return component.multiplicity - raise KeyError(str(kernel)) - @lru_cache(maxsize=None) def _implementation(self, kernel: Reference) -> Optional[Implementation]: """Return the implementation of a kernel. diff --git a/libmuscle/python/libmuscle/manager/test/conftest.py b/libmuscle/python/libmuscle/manager/test/conftest.py index e95f290c..24772bda 100644 --- a/libmuscle/python/libmuscle/manager/test/conftest.py +++ b/libmuscle/python/libmuscle/manager/test/conftest.py @@ -43,8 +43,8 @@ def topology_store(mmp_configuration) -> TopologyStore: @pytest.fixture -def snapshot_registry(mmp_configuration) -> SnapshotRegistry: - return SnapshotRegistry(mmp_configuration, None) +def snapshot_registry(mmp_configuration, topology_store) -> SnapshotRegistry: + return SnapshotRegistry(mmp_configuration, None, topology_store) @pytest.fixture @@ -100,8 +100,8 @@ def topology_store2(mmp_configuration2) -> TopologyStore: @pytest.fixture -def snapshot_registry2(mmp_configuration2) -> SnapshotRegistry: - return SnapshotRegistry(mmp_configuration2, None) +def snapshot_registry2(mmp_configuration2, topology_store) -> SnapshotRegistry: + return SnapshotRegistry(mmp_configuration2, None, topology_store) @pytest.fixture diff --git a/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py b/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py index a7f48656..cc713c6a 100644 --- a/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py +++ b/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py @@ -3,7 +3,6 @@ from unittest.mock import MagicMock import pytest -from libmuscle.snapshot import SnapshotMetadata from ymmsl import ( Configuration, Model, Component, Conduit, Implementation, ImplementationState as IState, Reference) @@ -11,6 +10,8 @@ from libmuscle.manager.snapshot_registry import ( SnapshotNode, SnapshotRegistry, calc_consistency, calc_consistency_list, safe_get, _ConnectionInfo) +from libmuscle.manager.topology_store import TopologyStore +from libmuscle.snapshot import SnapshotMetadata def make_snapshot(**msg_counts) -> SnapshotMetadata: @@ -100,8 +101,9 @@ def test_calc_consistency_list() -> None: def test_write_ymmsl(tmp_path: Path): + configuration = Configuration(Model('empty', [])) snapshot_registry = SnapshotRegistry( - Configuration(Model('empty', [])), tmp_path) + configuration, tmp_path, TopologyStore(configuration)) snapshot_registry._write_snapshot_ymmsl([]) paths = list(tmp_path.iterdir()) @@ -121,8 +123,9 @@ def test_write_ymmsl(tmp_path: Path): def test_snapshot_config(): + configuration = Configuration(Model('empty', [])) snapshot_registry = SnapshotRegistry( - Configuration(Model('empty', [])), None) + configuration, None, TopologyStore(configuration)) micro_metadata = SnapshotMetadata( ['simulation_time >= 24.0', 'wallclocktime >= 10'], 10.123456789, 24.3456789, None, {}, False, 'micro_snapshot') @@ -156,7 +159,7 @@ def test_snapshot_config(): def test_stateful_peers(uq: Configuration, micro_is_stateless: bool) -> None: - snapshot_registry = SnapshotRegistry(uq, None) + snapshot_registry = SnapshotRegistry(uq, None, TopologyStore(uq)) macro = Reference('macro') micro = Reference('micro') qmc = Reference('qmc') @@ -177,7 +180,7 @@ def test_stateful_peers(uq: Configuration, micro_is_stateless: bool) -> None: def test_connections(uq: Configuration) -> None: - snapshot_registry = SnapshotRegistry(uq, None) + snapshot_registry = SnapshotRegistry(uq, None, TopologyStore(uq)) macro = Reference('macro') micro = Reference('micro') qmc = Reference('qmc') @@ -225,16 +228,8 @@ def test_connections(uq: Configuration) -> None: assert not (info & _ConnectionInfo.PEER_IS_VECTOR) -def test_multiplicity(uq: Configuration) -> None: - snapshot_registry = SnapshotRegistry(uq, None) - assert snapshot_registry._multiplicity(Reference('qmc')) == [] - assert snapshot_registry._multiplicity(Reference('rr')) == [] - assert snapshot_registry._multiplicity(Reference('macro')) == [5] - assert snapshot_registry._multiplicity(Reference('micro')) == [5] - - def test_implementation(uq: Configuration) -> None: - snapshot_registry = SnapshotRegistry(uq, None) + snapshot_registry = SnapshotRegistry(uq, None, TopologyStore(uq)) qmc_impl = snapshot_registry._implementation(Reference('qmc')) assert qmc_impl.name == 'qmc_impl' @@ -245,7 +240,7 @@ def test_implementation(uq: Configuration) -> None: def test_stateful(uq: Configuration, micro_is_stateless: bool) -> None: uq.implementations['macro_impl'].stateful = IState.WEAKLY_STATEFUL - snapshot_registry = SnapshotRegistry(uq, None) + snapshot_registry = SnapshotRegistry(uq, None, TopologyStore(uq)) assert snapshot_registry._is_stateful(Reference('macro')) stateful = snapshot_registry._is_stateful(Reference('micro')) @@ -256,7 +251,8 @@ def test_stateful(uq: Configuration, micro_is_stateless: bool) -> None: def test_macro_micro_snapshots( macro_micro: Configuration, micro_is_stateless: bool) -> None: - snapshot_registry = SnapshotRegistry(macro_micro, None) + snapshot_registry = SnapshotRegistry( + macro_micro, None, TopologyStore(macro_micro)) # prevent actually writing a ymmsl file, testing that separately snapshot_registry._write_snapshot_ymmsl = MagicMock() macro = Reference('macro') @@ -319,7 +315,7 @@ def test_macro_micro_snapshots( def test_uq(uq: Configuration, micro_is_stateless: bool) -> None: - snapshot_registry = SnapshotRegistry(uq, None) + snapshot_registry = SnapshotRegistry(uq, None, TopologyStore(uq)) # prevent actually writing a ymmsl file, testing that separately snapshot_registry._write_snapshot_ymmsl = MagicMock() macro = Reference('macro') @@ -384,7 +380,7 @@ def test_heuristic_rollbacks() -> None: comp1, comp2, comp3, comp4 = (Reference(f'comp{i}') for i in range(4)) - snapshot_registry = SnapshotRegistry(config, None) + snapshot_registry = SnapshotRegistry(config, None, TopologyStore(config)) # prevent actually writing a ymmsl file, testing that separately snapshot_registry._write_snapshot_ymmsl = MagicMock() diff --git a/libmuscle/python/libmuscle/manager/topology_store.py b/libmuscle/python/libmuscle/manager/topology_store.py index c8e3f864..f3dd1fd3 100644 --- a/libmuscle/python/libmuscle/manager/topology_store.py +++ b/libmuscle/python/libmuscle/manager/topology_store.py @@ -1,4 +1,5 @@ from typing import Dict, List +from libmuscle.util import generate_indices, instance_indices from ymmsl import Conduit, PartialConfiguration, Model, Reference @@ -77,3 +78,30 @@ def get_peer_dimensions(self, kernel_name: Reference snd = conduit.sending_component() ret[snd] = self.kernel_dimensions[snd] return ret + + def get_peer_instances(self, instance: Reference) -> List[Reference]: + """Generates the names of all peer instances of an instance. + + Args: + instance: The instance whose peers to generate. + + Returns: + All peer instance identifiers. + """ + component = instance.without_trailing_ints() + indices = instance_indices(instance) + dims = self.kernel_dimensions[component] + all_peer_dims = self.get_peer_dimensions(component) + + peers = [] + for peer, peer_dims in all_peer_dims.items(): + base = peer + for i in range(min(len(dims), len(peer_dims))): + base += indices[i] + + if dims >= peer_dims: + peers.append(base) + else: + for peer_indices in generate_indices(peer_dims[len(dims):]): + peers.append(base + peer_indices) + return peers From e0d1c4a1edd90395958fd44c6d57384c83f1018f Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Tue, 13 Sep 2022 16:19:06 +0200 Subject: [PATCH 045/183] Placeholder: snapshot directory setting --- libmuscle/python/libmuscle/manager/manager.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/libmuscle/python/libmuscle/manager/manager.py b/libmuscle/python/libmuscle/manager/manager.py index c28e65fb..d25d1977 100644 --- a/libmuscle/python/libmuscle/manager/manager.py +++ b/libmuscle/python/libmuscle/manager/manager.py @@ -59,6 +59,11 @@ def __init__( self._configuration, self._run_dir.path / 'configuration.ymmsl') + # TODO: decide if this should be a setting or part of checkpoint_info + # TODO: separate folder per intance + self._configuration.settings.setdefault( + 'muscle_snapshot_directory', str(snapshot_dir)) + self._instance_manager = None # type: Optional[InstanceManager] try: configuration = self._configuration.as_configuration() From 0475d852dfe1e88af34a980a9adeceea0e4a473e Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Wed, 14 Sep 2022 12:50:55 +0200 Subject: [PATCH 046/183] Integration test for macro/micro snapshot & resume --- integration_test/test_snapshot_macro_micro.py | 163 ++++++++++++++++++ .../python/libmuscle/checkpoint_triggers.py | 1 + libmuscle/python/libmuscle/instance.py | 6 +- libmuscle/python/libmuscle/manager/run_dir.py | 6 +- 4 files changed, 172 insertions(+), 4 deletions(-) create mode 100644 integration_test/test_snapshot_macro_micro.py diff --git a/integration_test/test_snapshot_macro_micro.py b/integration_test/test_snapshot_macro_micro.py new file mode 100644 index 00000000..bbfd42db --- /dev/null +++ b/integration_test/test_snapshot_macro_micro.py @@ -0,0 +1,163 @@ +import sys +import pytest +from ymmsl import Operator, load + +from libmuscle import Instance, Message +from libmuscle.manager.manager import Manager +from libmuscle.manager.run_dir import RunDir + + +def macro(): + instance = Instance({ + Operator.O_I: ['o_i'], + Operator.S: ['s']}) + + while instance.reuse_instance(): + t_cur = instance.get_setting('t0', 'float') + dt = instance.get_setting('dt', 'float') + t_max = instance.get_setting('t_max', 'float') + + if instance.resuming(): + msg = instance.load_snapshot() + t_cur = msg.timestamp + assert msg.next_timestamp == pytest.approx(t_cur + dt) + i = msg.data + assert i >= 0 + else: + i = 0 + + while t_cur + dt <= t_max: + t_next = t_cur + dt + + if instance.should_save_snapshot(t_cur, t_next): + instance.save_snapshot(Message(t_cur, t_next, i)) + + t_next = None if t_next + dt > t_max else t_next + instance.send('o_i', Message(t_cur, t_next, i)) + + msg = instance.receive('s') + assert msg.data == i + + i += 1 + t_cur += dt + + if instance.should_save_final_snapshot(t_cur): + instance.save_final_snapshot(Message(t_cur, None, i)) + + +def micro(): + instance = Instance({ + Operator.F_INIT: ['f_i'], + Operator.O_F: ['o_f']}) + + while instance.reuse_instance(): + dt = instance.get_setting('dt', 'float') + t_max = instance.get_setting('t_max', 'float') + + if instance.resuming(): + msg = instance.load_snapshot() + t_cur = msg.timestamp + i, t_stop = msg.data + else: + msg = instance.receive('f_i') + t_cur = msg.timestamp + i = msg.data + t_stop = t_cur + t_max + + while t_cur < t_stop: + t_next = t_cur + dt + + if instance.should_save_snapshot(t_cur, t_next): + instance.save_snapshot(Message(t_cur, t_next, [i, t_stop])) + + t_cur += dt + + if instance.should_save_final_snapshot(t_cur): + instance.save_final_snapshot(Message(t_cur, None, [i, t_stop])) + + instance.send('o_f', Message(t_cur, None, i)) + + +def test_snapshot_macro_micro(tmp_path): + ymmsl_text = f"""ymmsl_version: v0.1 +model: + name: test_snapshot + components: + macro: macro_implementation + micro: micro_implementation + conduits: + macro.o_i: micro.f_i + micro.o_f: macro.s +settings: + macro.t0: 0.12 + macro.dt: 0.17 + macro.t_max: 1.9 + micro.dt: 0.009 + micro.t_max: 0.1 + muscle_remote_log_level: DEBUG +implementations: + macro_implementation: + executable: {sys.executable} + args: + - {__file__} + - macro + supports_checkpoint: true + micro_implementation: + executable: {sys.executable} + args: + - {__file__} + - micro + supports_checkpoint: true +resources: + macro: + threads: 1 + micro: + threads: 1 +checkpoints: + simulation_time: + - every: 0.4""" + ymmsl_doc = load(ymmsl_text) + + run_dir1 = RunDir(tmp_path / 'run1') + manager = Manager(ymmsl_doc, run_dir1) + manager.start_instances() + assert manager.wait() + + # Note: sorted only works because we have fewer than 10 snapshots, otherwise + # _10 would be sorted right after _1 + macro_snapshots = sorted(run_dir1.snapshot_dir().glob('macro*')) + assert len(macro_snapshots) == 6 # 0, 0.4, 0.8, 1.2, 1.6, final + micro_snapshots = sorted(run_dir1.snapshot_dir().glob('micro*')) + assert len(micro_snapshots) == 6 # 0, 0.4, 0.8, 1.2, 1.6, final + snapshots_ymmsl = sorted(run_dir1.snapshot_dir().glob('snapshot_*.ymmsl')) + snapshot_docs = list(map(load, snapshots_ymmsl)) + assert snapshot_docs[0].resume['macro'] == macro_snapshots[0] + assert snapshot_docs[0].resume['micro'] == micro_snapshots[0] + assert snapshot_docs[1].resume['macro'] == macro_snapshots[1] + assert snapshot_docs[1].resume['micro'] == micro_snapshots[0] + for i in range(2, 7): + assert snapshot_docs[i].resume['macro'] == macro_snapshots[i - 1] + assert snapshot_docs[i].resume['micro'] == micro_snapshots[i - 1] + + ymmsl_doc.update(snapshot_docs[4]) + del ymmsl_doc.settings['muscle_snapshot_directory'] + run_dir2 = RunDir(tmp_path / 'run2') + manager = Manager(ymmsl_doc, run_dir2) + manager.start_instances() + assert manager.wait() + + macro_snapshots = sorted(run_dir2.snapshot_dir().glob('macro*')) + assert len(macro_snapshots) == 2 # 1.6, final + micro_snapshots = sorted(run_dir2.snapshot_dir().glob('micro*')) + assert len(micro_snapshots) == 3 # 1.2, 1.6, final + snapshots_ymmsl = sorted(run_dir2.snapshot_dir().glob('snapshot_*.ymmsl')) + assert len(snapshots_ymmsl) == 2 + + +if __name__ == "__main__": + if 'macro' in sys.argv: + macro() + elif 'micro' in sys.argv: + micro() + else: + raise RuntimeError('Specify macro or micro on the command line') diff --git a/libmuscle/python/libmuscle/checkpoint_triggers.py b/libmuscle/python/libmuscle/checkpoint_triggers.py index 6e4d644e..88d47553 100644 --- a/libmuscle/python/libmuscle/checkpoint_triggers.py +++ b/libmuscle/python/libmuscle/checkpoint_triggers.py @@ -202,6 +202,7 @@ def __init__(self, utc_reference: datetime, checkpoints: Checkpoints self._last_triggers = [] # type: List[str] self._first_reuse = True + self._max_f_init_next_timestamp = None # type: Optional[float] # These attributes are only used to check if implementations are # following the guidelines diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py index f91bae55..8c358665 100644 --- a/libmuscle/python/libmuscle/instance.py +++ b/libmuscle/python/libmuscle/instance.py @@ -134,7 +134,9 @@ def reuse_instance(self, apply_overlay: bool = True) -> bool: # TODO: _f_init_cache should be empty here, or the user didn't # receive something that was sent on the last go-around. # At least emit a warning. - self.__pre_receive_f_init(apply_overlay) + if not (self.resuming() and self._first_run): + # when resuming we skip receiving on f_init in the first run + self.__pre_receive_f_init(apply_overlay) self._set_local_log_level() self._set_remote_log_level() @@ -147,11 +149,11 @@ def reuse_instance(self, apply_overlay: bool = True) -> bool: if f_init_not_connected and no_settings_in: do_reuse = self._first_run - self._first_run = False else: for message in self._f_init_cache.values(): if isinstance(message.data, ClosePort): do_reuse = False + self._first_run = False max_f_init_next_timestamp = max( (msg.next_timestamp diff --git a/libmuscle/python/libmuscle/manager/run_dir.py b/libmuscle/python/libmuscle/manager/run_dir.py index c2a50ed9..6a50c2fe 100644 --- a/libmuscle/python/libmuscle/manager/run_dir.py +++ b/libmuscle/python/libmuscle/manager/run_dir.py @@ -72,6 +72,8 @@ def snapshot_dir(self, name: Optional[Reference] = None) -> Path: The path to the snapshot directory """ if name is None: - return self.path / 'snapshots' + path = self.path / 'snapshots' else: - return self.instance_dir(name) / 'snapshots' + path = self.instance_dir(name) / 'snapshots' + path.mkdir(exist_ok=True) + return path From 869a23b6f42898a03110d235cd3731503bf20c05 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Thu, 15 Sep 2022 14:07:52 +0200 Subject: [PATCH 047/183] Make snapshot_registry threaded and thread-safe --- libmuscle/python/libmuscle/manager/manager.py | 10 ++++-- .../libmuscle/manager/snapshot_registry.py | 32 ++++++++++++++++++- .../manager/test/test_snapshot_registry.py | 32 +++++++++---------- 3 files changed, 55 insertions(+), 19 deletions(-) diff --git a/libmuscle/python/libmuscle/manager/manager.py b/libmuscle/python/libmuscle/manager/manager.py index d25d1977..d96842a7 100644 --- a/libmuscle/python/libmuscle/manager/manager.py +++ b/libmuscle/python/libmuscle/manager/manager.py @@ -51,8 +51,6 @@ def __init__( _logger.warning('Checkpoints are configured but no run' ' directory is provided. Snapshots will be' ' stored in the current working directory.') - self._snapshot_registry = SnapshotRegistry( - configuration, snapshot_dir, self._topology_store) if self._run_dir: save_ymmsl( @@ -73,6 +71,12 @@ def __init__( except ValueError: pass + # SnapshotRegistry creates a worker thread, must be created after + # instance_manager which forks the process + self._snapshot_registry = SnapshotRegistry( + configuration, snapshot_dir, self._topology_store) + self._snapshot_registry.start() + self._server = MMPServer( self._logger, self._configuration, self._instance_registry, self._topology_store, @@ -108,6 +112,8 @@ def stop(self) -> None: """Shuts down the manager.""" # self._server.stop() self._server.stop() + self._snapshot_registry.shutdown() + self._snapshot_registry.join() self._logger.close() def wait(self) -> bool: diff --git a/libmuscle/python/libmuscle/manager/snapshot_registry.py b/libmuscle/python/libmuscle/manager/snapshot_registry.py index 6e7f1199..d43db295 100644 --- a/libmuscle/python/libmuscle/manager/snapshot_registry.py +++ b/libmuscle/python/libmuscle/manager/snapshot_registry.py @@ -5,6 +5,8 @@ from itertools import chain, zip_longest from operator import attrgetter from pathlib import Path +from queue import Queue +from threading import Thread from typing import Dict, Optional, Set, List, Tuple, TypeVar from libmuscle.manager.topology_store import TopologyStore @@ -19,6 +21,7 @@ _SnapshotDictType = Dict[Reference, List["SnapshotNode"]] _ConnectionType = Tuple[Identifier, Identifier, "_ConnectionInfo"] +_QueueItemType = Optional[Tuple[Reference, SnapshotMetadata]] _T = TypeVar("_T") @@ -157,7 +160,7 @@ def do_consistency_check( return True -class SnapshotRegistry: +class SnapshotRegistry(Thread): """Registry of all snapshots taken by instances. Current snapshots are stored in a graph. Every node represents a snapshot @@ -176,6 +179,8 @@ def __init__( Args: config: ymmsl configuration describing the workflow. """ + super().__init__(name='SnapshotRegistry') + if config.model is None or not isinstance(config.model, Model): raise ValueError('The yMMSL experiment description does not' ' contain a (complete) model section, so there' @@ -185,6 +190,7 @@ def __init__( self._snapshot_folder = snapshot_folder self._topology_store = topology_store + self._queue = Queue() # type: Queue[_QueueItemType] self._snapshots = {} # type: _SnapshotDictType self._instances = set() # type: Set[Reference] @@ -199,6 +205,30 @@ def register_snapshot( self, instance: Reference, snapshot: SnapshotMetadata) -> None: """Register a new snapshot. + Args: + instance: The instance that created the snapshot + snapshot: Metadata describing the snapshot + """ + self._queue.put((instance, snapshot)) + + def run(self) -> None: + """Code executed in a separate thread + """ + while True: + item = self._queue.get() + if item is None: + return + self._add_snapshot(*item) + + def shutdown(self) -> None: + """Stop the snapshot registry thread + """ + self._queue.put(None) + + def _add_snapshot( + self, instance: Reference, snapshot: SnapshotMetadata) -> None: + """Register a new snapshot. + Args: instance: The instance that created the snapshot snapshot: Metadata describing the snapshot diff --git a/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py b/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py index cc713c6a..71e3fb7c 100644 --- a/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py +++ b/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py @@ -259,7 +259,7 @@ def test_macro_micro_snapshots( micro = Reference('micro') macro_snapshot = make_snapshot(o_i=[3], s=[3]) - snapshot_registry.register_snapshot(macro, macro_snapshot) + snapshot_registry._add_snapshot(macro, macro_snapshot) assert len(snapshot_registry._snapshots[macro]) == 1 node = snapshot_registry._snapshots[macro][0] @@ -281,14 +281,14 @@ def test_macro_micro_snapshots( # the macro snapshot above. However, it's still useful for testing the # consistency algorithm micro_snapshot = make_snapshot(f_i=[2], o_f=[1]) - snapshot_registry.register_snapshot(micro, micro_snapshot) + snapshot_registry._add_snapshot(micro, micro_snapshot) assert len(snapshot_registry._snapshots[micro]) == 1 assert not snapshot_registry._snapshots[micro][0].consistent snapshot_registry._write_snapshot_ymmsl.assert_not_called() micro_snapshot = make_snapshot(f_i=[3], o_f=[2]) - snapshot_registry.register_snapshot(micro, micro_snapshot) + snapshot_registry._add_snapshot(micro, micro_snapshot) # micro snapshots should be cleaned up now! assert len(snapshot_registry._snapshots[micro]) == 1 @@ -299,7 +299,7 @@ def test_macro_micro_snapshots( snapshot_registry._write_snapshot_ymmsl.reset_mock() micro_snapshot = make_snapshot(f_i=[4], o_f=[3]) - snapshot_registry.register_snapshot(micro, micro_snapshot) + snapshot_registry._add_snapshot(micro, micro_snapshot) # micro snapshots should be cleaned up now! assert len(snapshot_registry._snapshots[micro]) == 1 @@ -310,7 +310,7 @@ def test_macro_micro_snapshots( snapshot_registry._write_snapshot_ymmsl.reset_mock() macro_snapshot = make_snapshot(o_i=[4], s=[4]) - snapshot_registry.register_snapshot(macro, macro_snapshot) + snapshot_registry._add_snapshot(macro, macro_snapshot) snapshot_registry._write_snapshot_ymmsl.assert_called_once() @@ -324,14 +324,14 @@ def test_uq(uq: Configuration, micro_is_stateless: bool) -> None: rr = Reference('rr') qmc_snapshot = make_snapshot(parameters_out=[], states_in=[]) - snapshot_registry.register_snapshot(qmc, qmc_snapshot) + snapshot_registry._add_snapshot(qmc, qmc_snapshot) rr_snapshot = make_snapshot( front_in=[1, 1, 1, 1, 1, 0, 0, 0, 0, 0], front_out=[0] * 10, back_out=[1, 1, 1, 1, 1], back_in=[0] * 5) - snapshot_registry.register_snapshot(rr, rr_snapshot) + snapshot_registry._add_snapshot(rr, rr_snapshot) node = snapshot_registry._snapshots[rr][-1] assert qmc in node.consistent_peers snapshot_registry._write_snapshot_ymmsl.assert_not_called() @@ -339,7 +339,7 @@ def test_uq(uq: Configuration, micro_is_stateless: bool) -> None: macro_snapshot = make_snapshot( muscle_settings_in=[1], final_state_out=[0], o_i=[0], s=[0]) for i in range(5): - snapshot_registry.register_snapshot(macro + i, macro_snapshot) + snapshot_registry._add_snapshot(macro + i, macro_snapshot) node = snapshot_registry._snapshots[macro + i][-1] assert node.consistent_peers.keys() == {rr} if micro_is_stateless and i == 4: @@ -351,7 +351,7 @@ def test_uq(uq: Configuration, micro_is_stateless: bool) -> None: if not micro_is_stateless: micro_snapshot = make_snapshot(f_i=[1], o_f=[0]) for i in range(5): - snapshot_registry.register_snapshot(micro + i, micro_snapshot) + snapshot_registry._add_snapshot(micro + i, micro_snapshot) node = snapshot_registry._snapshots[micro + i][-1] assert node.consistent_peers.keys() == {macro + i} if i == 4: @@ -361,7 +361,7 @@ def test_uq(uq: Configuration, micro_is_stateless: bool) -> None: snapshot_registry._write_snapshot_ymmsl.assert_not_called() qmc_snapshot = make_snapshot(parameters_out=[1, 1, 1, 1, 1], states_in=[]) - snapshot_registry.register_snapshot(qmc, qmc_snapshot) + snapshot_registry._add_snapshot(qmc, qmc_snapshot) node = snapshot_registry._snapshots[qmc][-1] assert node.consistent_peers.keys() == {rr} snapshot_registry._write_snapshot_ymmsl.assert_called_once() @@ -385,25 +385,25 @@ def test_heuristic_rollbacks() -> None: snapshot_registry._write_snapshot_ymmsl = MagicMock() for i in range(4): - snapshot_registry.register_snapshot(comp1, make_snapshot(o_f=[i])) + snapshot_registry._add_snapshot(comp1, make_snapshot(o_f=[i])) assert len(snapshot_registry._snapshots[comp1]) == 4 for i in range(10): - snapshot_registry.register_snapshot( + snapshot_registry._add_snapshot( comp2, make_snapshot(f_i=[1], o_f=[0])) - snapshot_registry.register_snapshot( + snapshot_registry._add_snapshot( comp3, make_snapshot(f_i=[1], o_f=[0])) assert len(snapshot_registry._snapshots[comp2]) == 10 assert len(snapshot_registry._snapshots[comp3]) == 10 - snapshot_registry.register_snapshot(comp2, make_snapshot(f_i=[2], o_f=[1])) + snapshot_registry._add_snapshot(comp2, make_snapshot(f_i=[2], o_f=[1])) assert len(snapshot_registry._snapshots[comp2]) == 11 - snapshot_registry.register_snapshot(comp2, make_snapshot(f_i=[3], o_f=[2])) + snapshot_registry._add_snapshot(comp2, make_snapshot(f_i=[3], o_f=[2])) assert len(snapshot_registry._snapshots[comp2]) == 12 snapshot_registry._write_snapshot_ymmsl.assert_not_called() - snapshot_registry.register_snapshot( + snapshot_registry._add_snapshot( comp4, make_snapshot(f_i=[1])) snapshot_registry._write_snapshot_ymmsl.assert_called() From d2c6a141d0fbb4b86b42d570c7c9cd8908ead392 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Thu, 15 Sep 2022 16:33:16 +0200 Subject: [PATCH 048/183] Snapshot/resume test with multiplicity --- integration_test/test_snapshot_macro_micro.py | 107 ++++++++++++++++-- .../libmuscle/manager/snapshot_registry.py | 2 +- 2 files changed, 99 insertions(+), 10 deletions(-) diff --git a/integration_test/test_snapshot_macro_micro.py b/integration_test/test_snapshot_macro_micro.py index bbfd42db..93427098 100644 --- a/integration_test/test_snapshot_macro_micro.py +++ b/integration_test/test_snapshot_macro_micro.py @@ -1,4 +1,5 @@ import sys + import pytest from ymmsl import Operator, load @@ -7,6 +8,9 @@ from libmuscle.manager.run_dir import RunDir +_LOG_LEVEL = 'INFO' # set to DEBUG for additional debug info + + def macro(): instance = Instance({ Operator.O_I: ['o_i'], @@ -45,6 +49,46 @@ def macro(): instance.save_final_snapshot(Message(t_cur, None, i)) +def macro_vector(): + instance = Instance({ + Operator.O_I: ['o_i[]'], + Operator.S: ['s[]']}) + + while instance.reuse_instance(): + t_cur = instance.get_setting('t0', 'float') + dt = instance.get_setting('dt', 'float') + t_max = instance.get_setting('t_max', 'float') + + if instance.resuming(): + msg = instance.load_snapshot() + t_cur = msg.timestamp + assert msg.next_timestamp == pytest.approx(t_cur + dt) + i = msg.data + assert i >= 0 + else: + i = 0 + + while t_cur + dt <= t_max: + t_next = t_cur + dt + + if instance.should_save_snapshot(t_cur, t_next): + instance.save_snapshot(Message(t_cur, t_next, i)) + + t_next = None if t_next + dt > t_max else t_next + for slot in range(instance.get_port_length('o_i')): + instance.send('o_i', Message(t_cur, t_next, i), slot) + + for slot in range(instance.get_port_length('s')): + msg = instance.receive('s', slot) + assert msg.data == i + + i += 1 + t_cur += dt + + if instance.should_save_final_snapshot(t_cur): + instance.save_final_snapshot(Message(t_cur, None, i)) + + def micro(): instance = Instance({ Operator.F_INIT: ['f_i'], @@ -78,8 +122,9 @@ def micro(): instance.send('o_f', Message(t_cur, None, i)) -def test_snapshot_macro_micro(tmp_path): - ymmsl_text = f"""ymmsl_version: v0.1 +@pytest.fixture +def base_config(): + return load(f"""ymmsl_version: v0.1 model: name: test_snapshot components: @@ -94,7 +139,7 @@ def test_snapshot_macro_micro(tmp_path): macro.t_max: 1.9 micro.dt: 0.009 micro.t_max: 0.1 - muscle_remote_log_level: DEBUG + muscle_remote_log_level: {_LOG_LEVEL} implementations: macro_implementation: executable: {sys.executable} @@ -115,11 +160,13 @@ def test_snapshot_macro_micro(tmp_path): threads: 1 checkpoints: simulation_time: - - every: 0.4""" - ymmsl_doc = load(ymmsl_text) + - every: 0.4""") + +def test_snapshot_macro_micro(tmp_path, base_config): + base_config.check_consistent() run_dir1 = RunDir(tmp_path / 'run1') - manager = Manager(ymmsl_doc, run_dir1) + manager = Manager(base_config, run_dir1, _LOG_LEVEL) manager.start_instances() assert manager.wait() @@ -139,10 +186,12 @@ def test_snapshot_macro_micro(tmp_path): assert snapshot_docs[i].resume['macro'] == macro_snapshots[i - 1] assert snapshot_docs[i].resume['micro'] == micro_snapshots[i - 1] - ymmsl_doc.update(snapshot_docs[4]) - del ymmsl_doc.settings['muscle_snapshot_directory'] + base_config.update(snapshot_docs[4]) + del base_config.settings['muscle_snapshot_directory'] + base_config.check_consistent() + run_dir2 = RunDir(tmp_path / 'run2') - manager = Manager(ymmsl_doc, run_dir2) + manager = Manager(base_config, run_dir2, _LOG_LEVEL) manager.start_instances() assert manager.wait() @@ -154,9 +203,49 @@ def test_snapshot_macro_micro(tmp_path): assert len(snapshots_ymmsl) == 2 +def test_snapshot_macro_vector_micro(tmp_path, base_config): + macro_implementation = base_config.implementations['macro_implementation'] + macro_implementation.args[-1] = 'macro_vector' + base_config.model.components[1].multiplicity = [2] + base_config.check_consistent() + + run_dir1 = RunDir(tmp_path / 'run1') + manager = Manager(base_config, run_dir1, _LOG_LEVEL) + manager.start_instances() + assert manager.wait() + + macro_snapshots = sorted(run_dir1.snapshot_dir().glob('macro*')) + assert len(macro_snapshots) == 6 # 0, 0.4, 0.8, 1.2, 1.6, final + micro_snapshots = sorted(run_dir1.snapshot_dir().glob('micro*')) + assert len(micro_snapshots) == 6 * 2 # 0, 0.4, 0.8, 1.2, 1.6, final + snapshots_ymmsl = sorted(run_dir1.snapshot_dir().glob('snapshot_*.ymmsl')) + # iff micro[0] snapshots before micro[1] at t==0.4, an additional workflow + # snapshot can be created + assert len(snapshots_ymmsl) in (7, 8) + + snapshot_docs = list(map(load, sorted(snapshots_ymmsl))) + base_config.update(snapshot_docs[-3]) + del base_config.settings['muscle_snapshot_directory'] + base_config.check_consistent() + + run_dir2 = RunDir(tmp_path / 'run2') + manager = Manager(base_config, run_dir2, _LOG_LEVEL) + manager.start_instances() + assert manager.wait() + + macro_snapshots = sorted(run_dir2.snapshot_dir().glob('macro*')) + assert len(macro_snapshots) == 2 # 1.6, final + micro_snapshots = sorted(run_dir2.snapshot_dir().glob('micro*')) + assert len(micro_snapshots) == 3 * 2 # 1.2, 1.6, final + snapshots_ymmsl = sorted(run_dir2.snapshot_dir().glob('snapshot_*.ymmsl')) + assert len(snapshots_ymmsl) == 2 + + if __name__ == "__main__": if 'macro' in sys.argv: macro() + elif 'macro_vector' in sys.argv: + macro_vector() elif 'micro' in sys.argv: micro() else: diff --git a/libmuscle/python/libmuscle/manager/snapshot_registry.py b/libmuscle/python/libmuscle/manager/snapshot_registry.py index d43db295..ed1618e3 100644 --- a/libmuscle/python/libmuscle/manager/snapshot_registry.py +++ b/libmuscle/python/libmuscle/manager/snapshot_registry.py @@ -8,12 +8,12 @@ from queue import Queue from threading import Thread from typing import Dict, Optional, Set, List, Tuple, TypeVar -from libmuscle.manager.topology_store import TopologyStore from ymmsl import ( Reference, Model, Identifier, Implementation, save, PartialConfiguration, ImplementationState as IState) +from libmuscle.manager.topology_store import TopologyStore from libmuscle.snapshot import SnapshotMetadata From 11bb7f817cc3569dde7a85338d4ccd948c17c2dd Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Fri, 16 Sep 2022 11:41:42 +0200 Subject: [PATCH 049/183] Remove no-longer-relevant TODO comments --- libmuscle/python/libmuscle/communicator.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/libmuscle/python/libmuscle/communicator.py b/libmuscle/python/libmuscle/communicator.py index 7ffff004..d5ddfc39 100644 --- a/libmuscle/python/libmuscle/communicator.py +++ b/libmuscle/python/libmuscle/communicator.py @@ -79,7 +79,6 @@ def __init__(self, kernel: Reference, index: List[int], profiler: The profiler to use for recording sends and receives. """ - # TODO: pass a SnapshotManager and store as self._snapshot_manager self._kernel = kernel self._index = index self._declared_ports = declared_ports @@ -418,8 +417,6 @@ def __ports_from_declared(self) -> Dict[str, Port]: ports[port_name] = Port( port_name, operator, is_vector, is_connected, len(self._index), port_peer_dims) - # TODO: retrieve num_messages[] for this port from - # self._snapshot_manager when resuming return ports def __ports_from_conduits(self, conduits: List[Conduit] @@ -451,8 +448,6 @@ def __ports_from_conduits(self, conduits: List[Conduit] ports[str(port_id)] = Port( str(port_id), operator, is_vector, is_connected, len(self._index), port_peer_dims) - # TODO: retrieve num_messages[] for this port from - # self._snapshot_manager when resuming return ports def __settings_in_port(self, conduits: List[Conduit]) -> Port: @@ -472,8 +467,6 @@ def __settings_in_port(self, conduits: List[Conduit]) -> Port: conduit.sending_component())) return Port('muscle_settings_in', Operator.F_INIT, False, False, len(self._index), []) - # TODO: retrieve num_messages[] for this port from - # self._snapshot_manager when resuming def __get_client(self, instance: Reference) -> MPPClient: """Get or create a client to connect to the given instance. From 58dc980981e7911f7af3a4dd2c115682d5b85ef5 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Fri, 16 Sep 2022 15:53:16 +0200 Subject: [PATCH 050/183] Remove redundant cast --- libmuscle/python/libmuscle/instance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py index 8c358665..b74507ed 100644 --- a/libmuscle/python/libmuscle/instance.py +++ b/libmuscle/python/libmuscle/instance.py @@ -165,7 +165,7 @@ def reuse_instance(self, apply_overlay: bool = True) -> bool: # enabled, it might not exist and a KeyError is raised. try: snapshot_dir = self.get_setting('muscle_snapshot_directory', 'str') - snapshot_path = Path(cast(str, snapshot_dir)) + snapshot_path = Path(snapshot_dir) except KeyError: snapshot_path = None self._snapshot_manager.reuse_instance( From afb25c669aff0080cc36ab73c034399920930567 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Tue, 11 Oct 2022 16:52:53 +0200 Subject: [PATCH 051/183] Implement broadcasting (Python) Allow multiple conduits connected to a single output port. The only limitation is that the multiplicity of all connected peers must be the same (to avoid interference with vector port functionality). --- integration_test/test_broadcast.py | 44 +++++++++++++++ libmuscle/python/libmuscle/communicator.py | 53 +++++++++++++------ libmuscle/python/libmuscle/peer_manager.py | 44 +++++++++------ .../libmuscle/test/test_communicator.py | 43 +++++++-------- 4 files changed, 130 insertions(+), 54 deletions(-) create mode 100644 integration_test/test_broadcast.py diff --git a/integration_test/test_broadcast.py b/integration_test/test_broadcast.py new file mode 100644 index 00000000..0877cd8f --- /dev/null +++ b/integration_test/test_broadcast.py @@ -0,0 +1,44 @@ +from ymmsl import (Component, Conduit, Configuration, Operator, Model, + Settings) + +from libmuscle import Instance, Message +from libmuscle.runner import run_simulation + + +def broadcaster(): + instance = Instance({Operator.O_F: ['out']}) + + while instance.reuse_instance(): + # o_f + message = Message(0.0, None, 'testing') + instance.send('out', message) + + +def receiver(): + instance = Instance({Operator.F_INIT: ['in']}) + + while instance.reuse_instance(): + # f_init + msg = instance.receive('in') + assert msg.data == 'testing' + + +def test_broadcast(log_file_in_tmpdir): + elements = [ + Component('broadcast', 'broadcaster'), + Component('first', 'receiver'), + Component('second', 'receiver')] + + conduits = [ + Conduit('broadcast.out', 'first.in'), + Conduit('broadcast.out', 'second.in')] + + model = Model('test_model', elements, conduits) + settings = Settings() + + configuration = Configuration(model, settings) + + implementations = { + 'broadcaster': broadcaster, + 'receiver': receiver} + run_simulation(configuration, implementations) diff --git a/libmuscle/python/libmuscle/communicator.py b/libmuscle/python/libmuscle/communicator.py index efdcb30d..fc24a9a8 100644 --- a/libmuscle/python/libmuscle/communicator.py +++ b/libmuscle/python/libmuscle/communicator.py @@ -206,27 +206,30 @@ def send_message( return port = self._ports[port_name] - profile_event = self._profiler.start(ProfileEventType.SEND, port, - None, slot, None) - recv_endpoint = self._peer_manager.get_peer_endpoint( + recv_endpoints = self._peer_manager.get_peer_endpoints( snd_endpoint.port, slot_list) port_length = None if self._ports[port_name].is_resizable(): port_length = self._ports[port_name].get_length() - mcp_message = MPPMessage(snd_endpoint.ref(), recv_endpoint.ref(), - port_length, - message.timestamp, message.next_timestamp, - cast(Settings, message.settings), - message.data) - encoded_message = mcp_message.encoded() - self._post_office.deposit(recv_endpoint.ref(), encoded_message) - profile_event.stop() - if port.is_vector(): - profile_event.port_length = port.get_length() - profile_event.message_size = len(encoded_message) + for recv_endpoint in recv_endpoints: + profile_event = self._profiler.start(ProfileEventType.SEND, port, + None, slot, None) + + mcp_message = MPPMessage(snd_endpoint.ref(), recv_endpoint.ref(), + port_length, + message.timestamp, message.next_timestamp, + cast(Settings, message.settings), + message.data) + encoded_message = mcp_message.encoded() + self._post_office.deposit(recv_endpoint.ref(), encoded_message) + + profile_event.stop() + if port.is_vector(): + profile_event.port_length = port.get_length() + profile_event.message_size = len(encoded_message) def receive_message(self, port_name: str, slot: Optional[int] = None, default: Optional[Message] = None @@ -289,8 +292,10 @@ def receive_message(self, port_name: str, slot: Optional[int] = None, profile_event = self._profiler.start(ProfileEventType.RECEIVE, port, None, slot, None) - snd_endpoint = self._peer_manager.get_peer_endpoint( - recv_endpoint.port, slot_list) + # peer_manager already checks that there is at most one snd_endpoint + # connected to the port we receive on + snd_endpoint = self._peer_manager.get_peer_endpoints( + recv_endpoint.port, slot_list)[0] client = self.__get_client(snd_endpoint.instance()) mcp_message_bytes = client.receive(recv_endpoint.ref()) mcp_message = MPPMessage.from_bytes(mcp_message_bytes) @@ -372,9 +377,23 @@ def __ports_from_declared(self) -> Dict[str, Port]: port_id = Identifier(port_name) is_connected = self._peer_manager.is_connected(port_id) if is_connected: - peer_port = self._peer_manager.get_peer_port(port_id) + peer_ports = self._peer_manager.get_peer_ports(port_id) + peer_port = peer_ports[0] peer_ce = peer_port[:-1] port_peer_dims = self._peer_manager.get_peer_dims(peer_ce) + for peer_port in peer_ports[1:]: + peer_ce = peer_port[:-1] + if port_peer_dims != self._peer_manager.get_peer_dims( + peer_ce): + port_strs = ', '.join(map(str, peer_ports)) + raise RuntimeError(('Broadcast port "{}" is' + ' connected to peers with' + ' different dimensions. All' + ' peer components that this' + ' port is connected to must' + ' have the same multiplicity.' + ' Connected ports: {}.' + ).format(port_name, port_strs)) else: port_peer_dims = [] ports[port_name] = Port( diff --git a/libmuscle/python/libmuscle/peer_manager.py b/libmuscle/python/libmuscle/peer_manager.py index 5b8728c1..0a7600c0 100644 --- a/libmuscle/python/libmuscle/peer_manager.py +++ b/libmuscle/python/libmuscle/peer_manager.py @@ -34,15 +34,21 @@ def __init__(self, kernel: Reference, index: List[int], self.__index = index # peer port ids, indexed by local kernel.port id - self.__peers = dict() # type: Dict[Reference, Reference] + self.__peers = dict() # type: Dict[Reference, List[Reference]] for conduit in conduits: if str(conduit.sending_component()) == str(kernel): # we send on the port this conduit attaches to - self.__peers[conduit.sender] = conduit.receiver + self.__peers.setdefault( + conduit.sender, []).append(conduit.receiver) if str(conduit.receiving_component()) == str(kernel): # we receive on the port this conduit attaches to - self.__peers[conduit.receiver] = conduit.sender + if conduit.receiver in self.__peers: + raise RuntimeError(('Receiving port "{}" is connected by' + ' multiple conduits, but at most one' + ' is allowed.' + ).format(conduit.receiving_port())) + self.__peers[conduit.receiver] = [conduit.sender] self.__peer_dims = peer_dims # indexed by kernel id self.__peer_locations = peer_locations # indexed by instance id @@ -56,8 +62,8 @@ def is_connected(self, port: Identifier) -> bool: recv_port_full = self.__kernel + port return recv_port_full in self.__peers - def get_peer_port(self, port: Identifier) -> Reference: - """Get a reference for the peer port. + def get_peer_ports(self, port: Identifier) -> List[Reference]: + """Get a reference for the peer ports. Args: port: Name of the port on this side. @@ -83,8 +89,8 @@ def get_peer_locations(self, peer_instance: Reference) -> List[str]: """ return self.__peer_locations[peer_instance] - def get_peer_endpoint(self, port: Identifier, slot: List[int] - ) -> Endpoint: + def get_peer_endpoints(self, port: Identifier, slot: List[int] + ) -> List[Endpoint]: """Determine the peer endpoint for the given port and slot. Args: @@ -94,14 +100,20 @@ def get_peer_endpoint(self, port: Identifier, slot: List[int] Returns: The peer endpoint. """ - peer = self.__peers[self.__kernel + port] - peer_kernel = peer[:-1] - peer_port = cast(Identifier, peer[-1]) + peers = self.__peers[self.__kernel + port] + endpoints = [] - total_index = self.__index + slot + for peer in peers: + peer_kernel = peer[:-1] + peer_port = cast(Identifier, peer[-1]) - # rebalance the indices - peer_dim = len(self.__peer_dims[peer_kernel]) - peer_index = total_index[0:peer_dim] - peer_slot = total_index[peer_dim:] - return Endpoint(peer_kernel, peer_index, peer_port, peer_slot) + total_index = self.__index + slot + + # rebalance the indices + peer_dim = len(self.__peer_dims[peer_kernel]) + peer_index = total_index[0:peer_dim] + peer_slot = total_index[peer_dim:] + endpoints.append( + Endpoint(peer_kernel, peer_index, peer_port, peer_slot)) + + return endpoints diff --git a/libmuscle/python/libmuscle/test/test_communicator.py b/libmuscle/python/libmuscle/test/test_communicator.py index 8f0f1238..e2414856 100644 --- a/libmuscle/python/libmuscle/test/test_communicator.py +++ b/libmuscle/python/libmuscle/test/test_communicator.py @@ -1,3 +1,4 @@ +from typing import List from libmuscle.communicator import Communicator, Endpoint, Message from libmuscle.mpp_message import ClosePort, MPPMessage from libmuscle.port import Port @@ -43,26 +44,26 @@ def communicator() -> Communicator: pm = communicator._peer_manager pm.is_connected.return_value = True - def gpp(x) -> Reference: + def gpp(x) -> List[Reference]: if 'out' in str(x): - return Reference('in') - return Reference('out') + return [Reference('in')] + return [Reference('out')] - pm.get_peer_port = gpp + pm.get_peer_ports = gpp pm.get_peer_dims.return_value = [] pm.get_peer_locations.return_value = ['direct:test'] - def gpe(p, s) -> Reference: + def gpe(p, s) -> List[Reference]: endpoint = MagicMock() endpoint.instance.return_value = Reference('other') if 'out' in str(p): endpoint.ref.return_value = Reference('other.in[13]') else: endpoint.ref.return_value = Reference('other.out') - return endpoint + return [endpoint] - pm.get_peer_endpoint = gpe + pm.get_peer_endpoints = gpe communicator._ports = { 'out': Port('out', Operator.O_I, False, True, 1, []), @@ -79,26 +80,26 @@ def communicator2() -> Communicator: pm = communicator._peer_manager pm.is_connected.return_value = True - def gpp(x: Reference) -> Reference: + def gpp(x) -> List[Reference]: if 'out' in str(x): - return Reference('in') - return Reference('out') + return [Reference('in')] + return [Reference('out')] - pm.get_peer_port = gpp + pm.get_peer_ports = gpp pm.get_peer_dims.return_value = [] pm.get_peer_locations.return_value = ['direct:test'] - def gpe(p, s) -> Reference: + def gpe(p, s) -> List[Reference]: endpoint = MagicMock() endpoint.instance.return_value = Reference('kernel[13]') if 'out' in str(p): endpoint.ref.return_value = Reference('kernel[13].in') else: endpoint.ref.return_value = Reference('kernel[13].out') - return endpoint + return [endpoint] - pm.get_peer_endpoint = gpe + pm.get_peer_endpoints = gpe communicator._ports = { 'out': Port('out', Operator.O_I, True, True, 0, [20]), @@ -115,26 +116,26 @@ def communicator3() -> Communicator: pm = communicator._peer_manager pm.is_connected.return_value = True - def gpp(x: Reference) -> Reference: + def gpp(x) -> List[Reference]: if 'out' in str(x): - return Reference('in') - return Reference('out') + return [Reference('in')] + return [Reference('out')] - pm.get_peer_port = gpp + pm.get_peer_ports = gpp pm.get_peer_dims.return_value = [] pm.get_peer_locations.return_value = ['direct:test'] - def gpe(p, s) -> Reference: + def gpe(p, s) -> List[Reference]: endpoint = MagicMock() endpoint.instance.return_value = Reference('other') if 'out' in str(p): endpoint.ref.return_value = Reference('other.in[13]') else: endpoint.ref.return_value = Reference('other.out[13]') - return endpoint + return [endpoint] - pm.get_peer_endpoint = gpe + pm.get_peer_endpoints = gpe communicator._ports = { 'out': Port('out', Operator.O_I, True, True, 0, []), From 4e01031f61308bb0bdd109e954c7c6db977d1f76 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Thu, 13 Oct 2022 10:57:30 +0200 Subject: [PATCH 052/183] Rename 'broadcast' to 'multicast' --- integration_test/{test_broadcast.py => test_multicast.py} | 6 +++--- libmuscle/python/libmuscle/communicator.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) rename integration_test/{test_broadcast.py => test_multicast.py} (91%) diff --git a/integration_test/test_broadcast.py b/integration_test/test_multicast.py similarity index 91% rename from integration_test/test_broadcast.py rename to integration_test/test_multicast.py index 0877cd8f..8dedee17 100644 --- a/integration_test/test_broadcast.py +++ b/integration_test/test_multicast.py @@ -5,7 +5,7 @@ from libmuscle.runner import run_simulation -def broadcaster(): +def multicaster(): instance = Instance({Operator.O_F: ['out']}) while instance.reuse_instance(): @@ -23,7 +23,7 @@ def receiver(): assert msg.data == 'testing' -def test_broadcast(log_file_in_tmpdir): +def test_multicast(log_file_in_tmpdir): elements = [ Component('broadcast', 'broadcaster'), Component('first', 'receiver'), @@ -39,6 +39,6 @@ def test_broadcast(log_file_in_tmpdir): configuration = Configuration(model, settings) implementations = { - 'broadcaster': broadcaster, + 'broadcaster': multicaster, 'receiver': receiver} run_simulation(configuration, implementations) diff --git a/libmuscle/python/libmuscle/communicator.py b/libmuscle/python/libmuscle/communicator.py index fc24a9a8..cefd5e3b 100644 --- a/libmuscle/python/libmuscle/communicator.py +++ b/libmuscle/python/libmuscle/communicator.py @@ -386,13 +386,13 @@ def __ports_from_declared(self) -> Dict[str, Port]: if port_peer_dims != self._peer_manager.get_peer_dims( peer_ce): port_strs = ', '.join(map(str, peer_ports)) - raise RuntimeError(('Broadcast port "{}" is' + raise RuntimeError(('Multicast port "{}" is' ' connected to peers with' ' different dimensions. All' ' peer components that this' ' port is connected to must' ' have the same multiplicity.' - ' Connected ports: {}.' + ' Connected to ports: {}.' ).format(port_name, port_strs)) else: port_peer_dims = [] From 27c997c68d6dd33e6a596bdb8a1818fac98e22c8 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Fri, 14 Oct 2022 14:22:02 +0200 Subject: [PATCH 053/183] tox dependency to ymmsl branch feature/multicast --- tox.ini | 1 + 1 file changed, 1 insertion(+) diff --git a/tox.ini b/tox.ini index 0e2a1348..9c6c3968 100644 --- a/tox.ini +++ b/tox.ini @@ -8,6 +8,7 @@ deps = flake8 pytest pytest-cov + git+https://github.com/multiscale/ymmsl-python.git@feature/multicast#egg=ymmsl passenv = MUSCLE_TEST_PYTHON_ONLY From 839ced29a32d2c7278dd1b9007eecaabe6dc06d9 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Fri, 14 Oct 2022 14:22:51 +0200 Subject: [PATCH 054/183] Implement multicast in C++ Analogous to python implementation. --- libmuscle/cpp/src/libmuscle/communicator.cpp | 51 ++++++++++++---- libmuscle/cpp/src/libmuscle/peer_manager.cpp | 60 +++++++++++++------ libmuscle/cpp/src/libmuscle/peer_manager.hpp | 15 ++--- .../tests/mocks/mock_peer_manager.cpp | 8 +-- .../tests/mocks/mock_peer_manager.hpp | 8 +-- .../src/libmuscle/tests/test_communicator.cpp | 45 +++++++++----- .../src/libmuscle/tests/test_peer_manager.cpp | 24 ++++---- 7 files changed, 137 insertions(+), 74 deletions(-) diff --git a/libmuscle/cpp/src/libmuscle/communicator.cpp b/libmuscle/cpp/src/libmuscle/communicator.cpp index 4e9c5139..6b522259 100644 --- a/libmuscle/cpp/src/libmuscle/communicator.cpp +++ b/libmuscle/cpp/src/libmuscle/communicator.cpp @@ -120,7 +120,7 @@ void Communicator::send_message( // TODO start profile event - Endpoint recv_endpoint = peer_manager_->get_peer_endpoint( + auto recv_endpoints = peer_manager_->get_peer_endpoints( snd_endpoint.port, slot_list); Data settings_overlay(message.settings()); @@ -129,17 +129,18 @@ void Communicator::send_message( if (ports_.at(port_name).is_resizable()) port_length = ports_.at(port_name).get_length(); - MPPMessage mpp_message( - snd_endpoint.ref(), recv_endpoint.ref(), - port_length, message.timestamp(), Optional(), - settings_overlay, message.data()); + for (auto recv_endpoint : recv_endpoints) { + MPPMessage mpp_message( + snd_endpoint.ref(), recv_endpoint.ref(), + port_length, message.timestamp(), Optional(), + settings_overlay, message.data()); - if (message.has_next_timestamp()) - mpp_message.next_timestamp = message.next_timestamp(); - - auto message_bytes = std::make_unique(mpp_message.encoded()); - post_office_.deposit(recv_endpoint.ref(), std::move(message_bytes)); + if (message.has_next_timestamp()) + mpp_message.next_timestamp = message.next_timestamp(); + auto message_bytes = std::make_unique(mpp_message.encoded()); + post_office_.deposit(recv_endpoint.ref(), std::move(message_bytes)); + } // TODO: stop and complete profile event } @@ -177,8 +178,10 @@ Message Communicator::receive_message( // TODO start profile event - Endpoint snd_endpoint = peer_manager_->get_peer_endpoint( - recv_endpoint.port, slot_list); + // peer_manager already checks that there is at most one snd_endpoint + // connected to the port we receive on + Endpoint snd_endpoint = peer_manager_->get_peer_endpoints( + recv_endpoint.port, slot_list).at(0); MPPClient & client = get_client_(snd_endpoint.instance()); auto mpp_message = MPPMessage::from_bytes( client.receive(recv_endpoint.ref())); @@ -260,9 +263,31 @@ Communicator::Ports_ Communicator::ports_from_declared_() { bool is_connected = peer_manager_->is_connected(port_name); std::vector port_peer_dims; if (is_connected) { - Reference peer_port = peer_manager_->get_peer_port(port_name); + auto peer_ports = peer_manager_->get_peer_ports(port_name); + Reference peer_port = peer_ports.at(0); Reference peer_ce(peer_port.cbegin(), std::prev(peer_port.cend())); port_peer_dims = peer_manager_->get_peer_dims(peer_ce); + for (std::size_t i = 1; i < peer_ports.size(); i++) { + peer_port = peer_ports.at(i); + peer_ce = Reference(peer_port.cbegin(), std::prev(peer_port.cend())); + if (port_peer_dims != peer_manager_->get_peer_dims(peer_ce)) { + std::stringstream ss; + ss << "Multicast port \"" << port_name; + ss << "\" is connected to peers with different"; + ss << " dimensions. All peer components that this"; + ss << " port is connected to must have the same"; + ss << " multiplicity. Connected to ports: "; + bool first = true; + for (auto port : peer_ports) { + if (first) + first = false; + else + ss << ", "; + ss << port; + } + throw std::runtime_error(ss.str()); + } + } } ports.emplace(port_name, Port( port_name, ppo.first, is_vector, is_connected, diff --git a/libmuscle/cpp/src/libmuscle/peer_manager.cpp b/libmuscle/cpp/src/libmuscle/peer_manager.cpp index 02631bc4..51772072 100644 --- a/libmuscle/cpp/src/libmuscle/peer_manager.cpp +++ b/libmuscle/cpp/src/libmuscle/peer_manager.cpp @@ -1,5 +1,6 @@ #include +#include using ymmsl::Conduit; using ymmsl::Identifier; @@ -22,12 +23,26 @@ PeerManager::PeerManager( , peer_locations_(peer_locations) // indexed by peer instance id { for (auto const & conduit : conduits) { - if (conduit.sending_component() == kernel_) + if (conduit.sending_component() == kernel_) { // we send on the port this conduit attaches to - peers_.emplace(conduit.sender, conduit.receiver); - if (conduit.receiving_component() == kernel_) + auto search = peers_.find(conduit.sender); + if (search == peers_.end()) + search = peers_.emplace( + conduit.sender, std::vector()).first; + search->second.push_back(conduit.receiver); + } + if (conduit.receiving_component() == kernel_) { // we receive on the port this conduit attaches to - peers_.emplace(conduit.receiver, conduit.sender); + if (peers_.count(conduit.receiver)) { + std::stringstream ss; + ss << "Receiving port \"" << conduit.receiving_port(); + ss << "\" is connected by multiple conduits, but at most one"; + ss << " is allowed."; + throw std::runtime_error(ss.str()); + } + std::vector vec = {conduit.sender}; + peers_.emplace(conduit.receiver, vec); + } } } @@ -35,7 +50,8 @@ bool PeerManager::is_connected(Identifier const & port) const { return peers_.count(kernel_ + port); } -Reference PeerManager::get_peer_port(Identifier const & port) const { + std::vector const & PeerManager::get_peer_ports( + Identifier const & port) const { return peers_.at(kernel_ + port); } @@ -49,24 +65,30 @@ std::vector PeerManager::get_peer_locations( return peer_locations_.at(peer_instance); } -Endpoint PeerManager::get_peer_endpoint( +std::vector const PeerManager::get_peer_endpoints( Identifier const & port, std::vector const & slot ) const { - Reference peer = peers_.at(kernel_ + port); - Reference peer_kernel(peer.cbegin(), std::prev(peer.cend())); - Identifier peer_port = std::prev(peer.cend())->identifier(); - - std::vector total_index = index_; - total_index.insert(total_index.end(), slot.cbegin(), slot.cend()); - - // rebalance the indices - int peer_dim = peer_dims_.at(peer_kernel).size(); - auto peer_dim_it = std::next(total_index.cbegin(), peer_dim); - std::vector peer_index(total_index.cbegin(), peer_dim_it); - std::vector peer_slot(peer_dim_it, total_index.cend()); - return Endpoint(peer_kernel, peer_index, peer_port, peer_slot); + auto peers = peers_.at(kernel_ + port); + std::vector endpoints; + + for (auto peer : peers) { + Reference peer_kernel(peer.cbegin(), std::prev(peer.cend())); + Identifier peer_port = std::prev(peer.cend())->identifier(); + + std::vector total_index = index_; + total_index.insert(total_index.end(), slot.cbegin(), slot.cend()); + + // rebalance the indices + int peer_dim = peer_dims_.at(peer_kernel).size(); + auto peer_dim_it = std::next(total_index.cbegin(), peer_dim); + std::vector peer_index(total_index.cbegin(), peer_dim_it); + std::vector peer_slot(peer_dim_it, total_index.cend()); + endpoints.emplace_back(peer_kernel, peer_index, peer_port, peer_slot); + } + + return endpoints; } } } diff --git a/libmuscle/cpp/src/libmuscle/peer_manager.hpp b/libmuscle/cpp/src/libmuscle/peer_manager.hpp index 715bb1aa..c6ac5ff7 100644 --- a/libmuscle/cpp/src/libmuscle/peer_manager.hpp +++ b/libmuscle/cpp/src/libmuscle/peer_manager.hpp @@ -53,12 +53,13 @@ class PeerManager { */ bool is_connected(ymmsl::Identifier const & port) const; - /** Get a reference for the peer port. + /** Get a reference for all the peer ports. * * @param port Name of the port on this side. - * @return Name of the port on the peer. + * @return Names of the port on the peers. */ - ymmsl::Reference get_peer_port(ymmsl::Identifier const & port) const; + std::vector const & get_peer_ports( + ymmsl::Identifier const & port) const; /** Get the dimensions of a peer kernel. * @@ -76,20 +77,20 @@ class PeerManager { std::vector get_peer_locations( ymmsl::Reference const & peer_instance) const; - /** Determine the peer endpoint for the given port and slot. + /** Determine the peer endpoints for the given port and slot. * * @param port The port on our side to send or receive on. * @param slot The slot to send or receive on. - * @return The peer endpoint. + * @return The peer endpoints. */ - Endpoint get_peer_endpoint( + std::vector const get_peer_endpoints( ymmsl::Identifier const & port, std::vector const & slot) const; private: ymmsl::Reference kernel_; std::vector index_; - std::unordered_map peers_; + std::unordered_map> peers_; PeerDims peer_dims_; PeerLocations peer_locations_; }; diff --git a/libmuscle/cpp/src/libmuscle/tests/mocks/mock_peer_manager.cpp b/libmuscle/cpp/src/libmuscle/tests/mocks/mock_peer_manager.cpp index 85893e56..850d8501 100644 --- a/libmuscle/cpp/src/libmuscle/tests/mocks/mock_peer_manager.cpp +++ b/libmuscle/cpp/src/libmuscle/tests/mocks/mock_peer_manager.cpp @@ -28,7 +28,7 @@ bool MockPeerManager::is_connected(Identifier const & port) const { return is_connected_return_value; } -Reference MockPeerManager::get_peer_port(Identifier const & port) const { +std::vector MockPeerManager::get_peer_ports(Identifier const & port) const { return get_peer_port_table.at(port); } @@ -42,7 +42,7 @@ std::vector MockPeerManager::get_peer_locations( return std::vector({std::string("tcp:test")}); } -Endpoint MockPeerManager::get_peer_endpoint( +std::vector MockPeerManager::get_peer_endpoints( Identifier const & port, std::vector const & slot ) const @@ -74,9 +74,9 @@ PeerDims MockPeerManager::last_constructed_peer_dims; PeerLocations MockPeerManager::last_constructed_peer_locations; bool MockPeerManager::is_connected_return_value; -std::unordered_map MockPeerManager::get_peer_port_table; +std::unordered_map> MockPeerManager::get_peer_port_table; std::unordered_map> MockPeerManager::get_peer_dims_table; -std::unordered_map MockPeerManager::get_peer_endpoint_table; +std::unordered_map> MockPeerManager::get_peer_endpoint_table; } } diff --git a/libmuscle/cpp/src/libmuscle/tests/mocks/mock_peer_manager.hpp b/libmuscle/cpp/src/libmuscle/tests/mocks/mock_peer_manager.hpp index e5448662..231ce403 100644 --- a/libmuscle/cpp/src/libmuscle/tests/mocks/mock_peer_manager.hpp +++ b/libmuscle/cpp/src/libmuscle/tests/mocks/mock_peer_manager.hpp @@ -26,14 +26,14 @@ class MockPeerManager { bool is_connected(ymmsl::Identifier const & port) const; - ymmsl::Reference get_peer_port(ymmsl::Identifier const & port) const; + std::vector get_peer_ports(ymmsl::Identifier const & port) const; std::vector get_peer_dims(ymmsl::Reference const & peer_kernel) const; std::vector get_peer_locations( ymmsl::Reference const & peer_instance) const; - Endpoint get_peer_endpoint( + std::vector get_peer_endpoints( ymmsl::Identifier const & port, std::vector const & slot) const; @@ -48,11 +48,11 @@ class MockPeerManager { static PeerLocations last_constructed_peer_locations; static bool is_connected_return_value; - static std::unordered_map + static std::unordered_map> get_peer_port_table; static std::unordered_map> get_peer_dims_table; - static std::unordered_map + static std::unordered_map> get_peer_endpoint_table; }; diff --git a/libmuscle/cpp/src/libmuscle/tests/test_communicator.cpp b/libmuscle/cpp/src/libmuscle/tests/test_communicator.cpp index 61ec225a..4286f452 100644 --- a/libmuscle/cpp/src/libmuscle/tests/test_communicator.cpp +++ b/libmuscle/cpp/src/libmuscle/tests/test_communicator.cpp @@ -109,8 +109,10 @@ std::unique_ptr connected_communicator() { {Reference("other"), {"tcp:test"}}}); MockPeerManager::get_peer_dims_table.emplace("other", std::vector({1})); - MockPeerManager::get_peer_endpoint_table.emplace("out", Endpoint("other", {}, "in", {13})); - MockPeerManager::get_peer_endpoint_table.emplace("in", Endpoint("other", {}, "out", {13})); + MockPeerManager::get_peer_endpoint_table.emplace("out", + std::vector({Endpoint("other", {}, "in", {13})})); + MockPeerManager::get_peer_endpoint_table.emplace("in", + std::vector({Endpoint("other", {}, "out", {13})})); comm->connect(conduits, peer_dims, peer_locations); return std::move(comm); @@ -130,8 +132,10 @@ std::unique_ptr connected_communicator2() { {Reference("kernel"), {"tcp:test"}}}); MockPeerManager::get_peer_dims_table.emplace("kernel", std::vector({20})); - MockPeerManager::get_peer_endpoint_table.emplace("in[13]", Endpoint("kernel", {13}, "out", {})); - MockPeerManager::get_peer_endpoint_table.emplace("out[13]", Endpoint("kernel", {13}, "in", {})); + MockPeerManager::get_peer_endpoint_table.emplace("in[13]", + std::vector({Endpoint("kernel", {13}, "out", {})})); + MockPeerManager::get_peer_endpoint_table.emplace("out[13]", + std::vector({Endpoint("kernel", {13}, "in", {})})); comm->connect(conduits, peer_dims, peer_locations); return std::move(comm); @@ -156,10 +160,14 @@ std::unique_ptr connected_communicator3() { {Reference("other"), {"tcp:test"}}}); MockPeerManager::get_peer_dims_table.emplace("other", std::vector({})); - MockPeerManager::get_peer_endpoint_table.emplace("out[13]", Endpoint("other", {}, "in", {13})); - MockPeerManager::get_peer_endpoint_table.emplace("in[13]", Endpoint("other", {}, "out", {13})); - MockPeerManager::get_peer_port_table.emplace("out", "other.in"); - MockPeerManager::get_peer_port_table.emplace("in", "other.out"); + MockPeerManager::get_peer_endpoint_table.emplace("out[13]", + std::vector({Endpoint("other", {}, "in", {13})})); + MockPeerManager::get_peer_endpoint_table.emplace("in[13]", + std::vector({Endpoint("other", {}, "out", {13})})); + MockPeerManager::get_peer_port_table.emplace("out", + std::vector({"other.in"})); + MockPeerManager::get_peer_port_table.emplace("in", + std::vector({"other.out"})); comm->connect(conduits, peer_dims, peer_locations); return std::move(comm); @@ -239,9 +247,12 @@ TEST(libmuscle_communicator, test_connect_vector_ports) { {Reference("other3"), {"tcp:test3"}} }); - MockPeerManager::get_peer_port_table.emplace("in", "other1.out"); - MockPeerManager::get_peer_port_table.emplace("out1", "other.in"); - MockPeerManager::get_peer_port_table.emplace("out2", "other3.in"); + MockPeerManager::get_peer_port_table.emplace("in", + std::vector({"other1.out"})); + MockPeerManager::get_peer_port_table.emplace("out1", + std::vector({"other.in"})); + MockPeerManager::get_peer_port_table.emplace("out2", + std::vector({"other3.in"})); MockPeerManager::get_peer_dims_table.emplace("other1", std::vector({20, 7})); MockPeerManager::get_peer_dims_table.emplace("other", std::vector({25})); @@ -294,7 +305,8 @@ TEST(libmuscle_communicator, test_connect_multidimensional_ports) { {Reference("other"), {"tcp:test"}} }); - MockPeerManager::get_peer_port_table.emplace("in", "other.out"); + MockPeerManager::get_peer_port_table.emplace("in", + std::vector({"other.out"})); MockPeerManager::get_peer_dims_table.emplace("other", std::vector({20, 7, 30})); ASSERT_THROW( @@ -330,9 +342,12 @@ TEST(libmuscle_communicator, test_connect_inferred_ports) { {Reference("other2"), {"tcp:test2"}} }); - MockPeerManager::get_peer_port_table.emplace("in", "other1.out"); - MockPeerManager::get_peer_port_table.emplace("out1", "other.in"); - MockPeerManager::get_peer_port_table.emplace("out3", "other2.in"); + MockPeerManager::get_peer_port_table.emplace("in", + std::vector({"other1.out"})); + MockPeerManager::get_peer_port_table.emplace("out1", + std::vector({"other.in"})); + MockPeerManager::get_peer_port_table.emplace("out3", + std::vector({"other2.in"})); MockPeerManager::get_peer_dims_table.emplace("other1", std::vector({20, 7})); MockPeerManager::get_peer_dims_table.emplace("other", std::vector({25})); diff --git a/libmuscle/cpp/src/libmuscle/tests/test_peer_manager.cpp b/libmuscle/cpp/src/libmuscle/tests/test_peer_manager.cpp index 7e0f71db..eee0977a 100644 --- a/libmuscle/cpp/src/libmuscle/tests/test_peer_manager.cpp +++ b/libmuscle/cpp/src/libmuscle/tests/test_peer_manager.cpp @@ -82,16 +82,16 @@ TEST(libmuscle_peer_manager, is_connected) { TEST(libmuscle_peer_manager, get_peer_port) { auto pm = peer_manager(); - ASSERT_EQ(pm.get_peer_port("out"), "other.in"); - ASSERT_EQ(pm.get_peer_port("in"), "other.out"); + ASSERT_EQ(pm.get_peer_ports("out"), std::vector({"other.in"})); + ASSERT_EQ(pm.get_peer_ports("in"), std::vector({"other.out"})); auto pm2 = peer_manager2(); - ASSERT_EQ(pm2.get_peer_port("out"), "kernel.in"); - ASSERT_EQ(pm2.get_peer_port("in"), "kernel.out"); + ASSERT_EQ(pm2.get_peer_ports("out"), std::vector({"kernel.in"})); + ASSERT_EQ(pm2.get_peer_ports("in"), std::vector({"kernel.out"})); auto pm3 = peer_manager3(); - ASSERT_EQ(pm3.get_peer_port("out"), "other.in"); - ASSERT_EQ(pm3.get_peer_port("in"), "other.out"); + ASSERT_EQ(pm3.get_peer_ports("out"), std::vector({"other.in"})); + ASSERT_EQ(pm3.get_peer_ports("in"), std::vector({"other.out"})); } TEST(libmuscle_peer_manager, get_peer_dims) { @@ -121,16 +121,16 @@ TEST(libmuscle_peer_manager, get_peer_locations) { TEST(libmuscle_peer_manager, get_peer_endpoint) { auto pm = peer_manager(); - ASSERT_EQ(std::string(pm.get_peer_endpoint("out", {})), "other.in[13]"); - ASSERT_EQ(std::string(pm.get_peer_endpoint("in", {})), "other.out[13]"); + ASSERT_EQ(std::string(pm.get_peer_endpoints("out", {})[0]), "other.in[13]"); + ASSERT_EQ(std::string(pm.get_peer_endpoints("in", {})[0]), "other.out[13]"); auto pm2 = peer_manager2(); - ASSERT_EQ(std::string(pm2.get_peer_endpoint("out", {11})), "kernel[11].in"); - ASSERT_EQ(std::string(pm2.get_peer_endpoint("in", {11})), "kernel[11].out"); + ASSERT_EQ(std::string(pm2.get_peer_endpoints("out", {11})[0]), "kernel[11].in"); + ASSERT_EQ(std::string(pm2.get_peer_endpoints("in", {11})[0]), "kernel[11].out"); auto pm3 = peer_manager3(); - ASSERT_EQ(std::string(pm3.get_peer_endpoint("out", {42})), "other.in[42]"); - ASSERT_EQ(std::string(pm3.get_peer_endpoint("in", {42})), "other.out[42]"); + ASSERT_EQ(std::string(pm3.get_peer_endpoints("out", {42})[0]), "other.in[42]"); + ASSERT_EQ(std::string(pm3.get_peer_endpoints("in", {42})[0]), "other.out[42]"); } From 6876732f024fc8a55fdd432d9e03c4af257dd416 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Fri, 14 Oct 2022 14:23:03 +0200 Subject: [PATCH 055/183] Add integration test for c++ multicast --- integration_test/test_multicast_cpp.py | 91 ++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) create mode 100644 integration_test/test_multicast_cpp.py diff --git a/integration_test/test_multicast_cpp.py b/integration_test/test_multicast_cpp.py new file mode 100644 index 00000000..d97fc0d6 --- /dev/null +++ b/integration_test/test_multicast_cpp.py @@ -0,0 +1,91 @@ +from pathlib import Path +import sys + +import ymmsl + +from libmuscle import Instance +from libmuscle.manager.manager import Manager +from libmuscle.manager.run_dir import RunDir + +# when executing this file as a component, .conftest cannot be resolved +if __name__ == "__main__": + def skip_if_python_only(func): + return func +else: + from .conftest import skip_if_python_only + + +def receiver(): + instance = Instance({ymmsl.Operator.F_INIT: ['in']}) + + i = 0 + while instance.reuse_instance(): + # f_init + msg = instance.receive('in') + assert msg.data == i + assert isinstance(msg.data, int) + i += 1 + + +@skip_if_python_only +def test_multicast_cpp(tmpdir): + tmppath = Path(str(tmpdir)) + + # find our test component and its requirements + cpp_build_dir = Path(__file__).parents[1] / 'libmuscle' / 'cpp' / 'build' + lib_paths = [cpp_build_dir / 'msgpack' / 'msgpack' / 'lib'] + ld_lib_path = ':'.join(map(str, lib_paths)) + + cpp_test_dir = cpp_build_dir / 'libmuscle' / 'tests' + test_component = cpp_test_dir / 'component_test' + + # make config + ymmsl_text = f""" +ymmsl_version: v0.1 +model: + name: test_model + components: + multicast: + implementation: component + receiver1: + implementation: receiver + receiver2: + implementation: receiver + conduits: + multicast.out: + - receiver1.in + - receiver2.in +implementations: + component: + env: + LD_LIBRARY_PATH: {ld_lib_path} + executable: {test_component} + receiver: + executable: {sys.executable} + args: + - {__file__} +resources: + multicast: + threads: 1 + receiver1: + threads: 1 + receiver2: + threads: 1""" + + config = ymmsl.load(ymmsl_text) + config.as_configuration().check_consistent() + + # set up + run_dir = RunDir(tmppath / 'run') + + # launch MUSCLE Manager with simulation + manager = Manager(config, run_dir) + manager.start_instances() + success = manager.wait() + + # check that all did not go well + assert success + + +if __name__ == "__main__": + receiver() From 4f979999aef870452104a74a2d04fc1b7e0b721f Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Tue, 18 Oct 2022 11:51:03 +0200 Subject: [PATCH 056/183] Ensure correct timestamp type in Message Fixes #118 When an incorrect type is provided by the user for Message.timestamp or Message.next_timestamp, MsgPack will serialize an invalid MMPMessage on the wire. This leads to errors in statically typed peer actors. Issue is fixed by explicitly converting to float in Message.__init__ and checking again in MMPMessage.__init__ (as the user may have assigned another value between creation of the Message and Instance.send). --- libmuscle/python/libmuscle/communicator.py | 5 +++++ libmuscle/python/libmuscle/mpp_message.py | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/libmuscle/python/libmuscle/communicator.py b/libmuscle/python/libmuscle/communicator.py index efdcb30d..fec6e736 100644 --- a/libmuscle/python/libmuscle/communicator.py +++ b/libmuscle/python/libmuscle/communicator.py @@ -49,6 +49,11 @@ def __init__(self, timestamp: float, next_timestamp: Optional[float], data: An object to send or that was received. settings: Overlay settings to send or that were received. """ + # make sure timestamp and next_timestamp are floats + timestamp = float(timestamp) + if next_timestamp is not None: + next_timestamp = float(next_timestamp) + self.timestamp = timestamp self.next_timestamp = next_timestamp self.data = data diff --git a/libmuscle/python/libmuscle/mpp_message.py b/libmuscle/python/libmuscle/mpp_message.py index 15ff09f9..8b84aeab 100644 --- a/libmuscle/python/libmuscle/mpp_message.py +++ b/libmuscle/python/libmuscle/mpp_message.py @@ -171,6 +171,11 @@ def __init__(self, sender: Reference, receiver: Reference, settings_overlay: The serialised overlay settings. data: The serialised contents of the message. """ + # make sure timestamp and next_timestamp are floats + timestamp = float(timestamp) + if next_timestamp is not None: + next_timestamp = float(next_timestamp) + self.sender = sender self.receiver = receiver self.port_length = port_length From 29e9132fd738635d49271fd1188b25ac992dcf85 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Tue, 18 Oct 2022 14:34:43 +0200 Subject: [PATCH 057/183] Add documentation on coupling with multicast --- docs/source/coupling.rst | 52 ++++++++++++++++++++++++++++++++++++++++ docs/source/index.rst | 1 + 2 files changed, 53 insertions(+) create mode 100644 docs/source/coupling.rst diff --git a/docs/source/coupling.rst b/docs/source/coupling.rst new file mode 100644 index 00000000..8c764a97 --- /dev/null +++ b/docs/source/coupling.rst @@ -0,0 +1,52 @@ +Coupling your model +=================== + +Multicast +--------- + +With MUSCLE3 you can connect an output port to multiple input ports. This is +called multicast. When a submodel sends a message on a port that is connected to +multiple input ports, the message is copied and sent to each connected port. + +.. note:: + + It is not allowed to connect multiple output ports to a single input port. + +Example +``````` + +.. tabs:: + + .. code-tab:: yaml Basic macro/micro model configuration + + ymmsl_version: v0.1 + model: + name: multicast + components: + macro: macro + micro: micro + conduits: + macro.state_out: micro.state_in + micro.state_out: macro.state_in + + .. code-tab:: yaml Extended configuration with multicast + + ymmsl_version: v0.1 + model: + name: multicast + components: + macro: macro + micro: micro + printer: printer + conduits: + macro.state_out: micro.state_in + micro.state_out: + - macro.state_in + - printer.in + +In the second tab, a new component `printer` is added and wired to the +``state_out`` port of the micro model. Whenever the micro model sends a message +on that port, one copy is sent to the macro model to continue the simulation. +Another copy is sent to the printer component, which (for example) prints a +summary of the state. + diff --git a/docs/source/index.rst b/docs/source/index.rst index f9b2096c..ed55ba48 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -33,6 +33,7 @@ Cham. ``_ installing tutorial distributed_execution + coupling cplusplus fortran mpi From 3de906f5fe4cab6192f5f9e111267d4b070bf6bb Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Mon, 24 Oct 2022 17:24:38 +0200 Subject: [PATCH 058/183] Fix make errors when CXX=clang++ --- libmuscle/cpp/src/ymmsl/identity.cpp | 2 +- libmuscle/cpp/src/ymmsl/identity.hpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/libmuscle/cpp/src/ymmsl/identity.cpp b/libmuscle/cpp/src/ymmsl/identity.cpp index d9816b7f..e9bf7266 100644 --- a/libmuscle/cpp/src/ymmsl/identity.cpp +++ b/libmuscle/cpp/src/ymmsl/identity.cpp @@ -10,7 +10,7 @@ using namespace std::string_literals; -::std::size_t ::std::hash<::ymmsl::impl::Identifier>::operator()( +::std::size_t (::std::hash<::ymmsl::impl::Identifier>::operator())( argument_type const & id) const noexcept { return hash()(id.data_); diff --git a/libmuscle/cpp/src/ymmsl/identity.hpp b/libmuscle/cpp/src/ymmsl/identity.hpp index a6197d51..f06af00e 100644 --- a/libmuscle/cpp/src/ymmsl/identity.hpp +++ b/libmuscle/cpp/src/ymmsl/identity.hpp @@ -109,8 +109,8 @@ class Identifier { friend bool operator==(std::string const & lhs, Identifier const & rhs); friend bool operator!=(std::string const & lhs, Identifier const & rhs); friend std::ostream & operator<<(std::ostream & os, Identifier const & i); - friend ::std::size_t ::std::hash<::ymmsl::impl::Identifier>::operator()( - ::ymmsl::impl::Identifier const & id) const; + friend ::std::size_t (::std::hash<::ymmsl::impl::Identifier>::operator())( + ::ymmsl::impl::Identifier const & id) const noexcept; std::string data_; }; From 2003c1df9cdf87da75f68e4362adbbc69f28bc5d Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Sun, 30 Oct 2022 08:57:27 +0100 Subject: [PATCH 059/183] Update requirements.txt for documentation build --- docs/requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/requirements.txt b/docs/requirements.txt index 2508a213..98b8214f 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,7 +1,9 @@ breathe +click msgpack==0.6.1 netifaces numpy>=1.12 +qcg-pilotjob six sphinx-fortran sphinx-tabs From 39ad298c675126d9463c0d13495822e61a3dd55c Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Wed, 2 Nov 2022 13:55:01 +0100 Subject: [PATCH 060/183] Fix issue with dependency compilation on clang --- libmuscle/cpp/build/libmuscle/tests/Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/libmuscle/cpp/build/libmuscle/tests/Makefile b/libmuscle/cpp/build/libmuscle/tests/Makefile index 0a22272b..327b4ddb 100644 --- a/libmuscle/cpp/build/libmuscle/tests/Makefile +++ b/libmuscle/cpp/build/libmuscle/tests/Makefile @@ -61,6 +61,9 @@ endif %.d: %.cpp $(CXX) $(CPPFLAGS) $(CXXFLAGS) -E -MM -MT $(@:.d=.o) $< -o $@ +mpi%.d: mpi%.cpp + $(MPICXX) $(CPPFLAGS) $(CXXFLAGS) -E -MM -MT $(@:.d=.o) $< -o $@ + %.o: %.cpp $(CXX) $(CPPFLAGS) $(CXXFLAGS) $(DEBUGFLAGS) -c $< -o $@ From 78d9204b2db87b76dbf99760abfe35b5422373dc Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Wed, 2 Nov 2022 14:14:33 +0100 Subject: [PATCH 061/183] Update release docs to check RTD docs render before releasing. --- docs/source/releasing.rst | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/docs/source/releasing.rst b/docs/source/releasing.rst index eff40de4..e93feb0f 100644 --- a/docs/source/releasing.rst +++ b/docs/source/releasing.rst @@ -8,6 +8,23 @@ branching model. Making a release involves quite a few steps, so they're listed here to help make the process more reliable; this information is really only useful for the maintainers. +Check online documentation +-------------------------- + +Online documentation rendering on ReadTheDoc works a bit differently than local +builds, as a result of which checking a local documentation build only partially +ensures we get working online documentation. So this needs to be checked: + +- Check develop branch documentation is there +- Specifically, check the Python API documentation page +- Check the other languages too + +If the Python API docs are missing, then it's likely to be a dependency problem. +Sphinx needs dependencies installed, and that's done differently by tox (which +uses `setup.py`) and RTD (which uses `docs/requirements.txt`). If the latter is +outdated, the Python API docs don't render because Sphinx fails to import the +packages. + Check metadata -------------- From 848ef7b6ef49ef593c6643695390979364fe363e Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Tue, 9 Aug 2022 13:30:41 +0200 Subject: [PATCH 062/183] Add `tox` as a [dev] dependency --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index ade2fa38..b99e2d06 100644 --- a/setup.py +++ b/setup.py @@ -61,7 +61,8 @@ 'dev': [ 'sphinx<3.2', 'sphinx_rtd_theme', - 'sphinx-fortran' + 'sphinx-fortran', + 'tox' ] }, ) From 19fe89959cdf0b3591cfbde26c0c6c97768a3932 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Wed, 2 Nov 2022 15:34:21 +0100 Subject: [PATCH 063/183] Fix 'non-void function does not return' warnings All fortran_c wrapper functions now return a default value when catching an error from the C++ API. Fixes the compiler warnings: 'non-void function does not return a value in all control paths [-Wreturn-type]' --- .../bindings/libmuscle_fortran_c.cpp | 44 ++++++++++++++ .../bindings/libmuscle_mpi_fortran_c.cpp | 44 ++++++++++++++ .../src/ymmsl/bindings/ymmsl_fortran_c.cpp | 11 ++++ scripts/api_generator.py | 58 +++++++++++++++++++ scripts/make_libmuscle_api.py | 6 ++ 5 files changed, 163 insertions(+) diff --git a/libmuscle/cpp/src/libmuscle/bindings/libmuscle_fortran_c.cpp b/libmuscle/cpp/src/libmuscle/bindings/libmuscle_fortran_c.cpp index 1ed5ea5d..418a9c89 100644 --- a/libmuscle/cpp/src/libmuscle/bindings/libmuscle_fortran_c.cpp +++ b/libmuscle/cpp/src/libmuscle/bindings/libmuscle_fortran_c.cpp @@ -472,6 +472,7 @@ bool LIBMUSCLE_DataConstRef_as_logical_(std::intptr_t self, int * err_code, char *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return false; } void LIBMUSCLE_DataConstRef_as_character_(std::intptr_t self, char ** ret_val, std::size_t * ret_val_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -563,6 +564,7 @@ int LIBMUSCLE_DataConstRef_as_int_(std::intptr_t self, int * err_code, char ** e *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } char LIBMUSCLE_DataConstRef_as_int1_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -607,6 +609,7 @@ char LIBMUSCLE_DataConstRef_as_int1_(std::intptr_t self, int * err_code, char ** *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } short int LIBMUSCLE_DataConstRef_as_int2_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -651,6 +654,7 @@ short int LIBMUSCLE_DataConstRef_as_int2_(std::intptr_t self, int * err_code, ch *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } int32_t LIBMUSCLE_DataConstRef_as_int4_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -695,6 +699,7 @@ int32_t LIBMUSCLE_DataConstRef_as_int4_(std::intptr_t self, int * err_code, char *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } int64_t LIBMUSCLE_DataConstRef_as_int8_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -739,6 +744,7 @@ int64_t LIBMUSCLE_DataConstRef_as_int8_(std::intptr_t self, int * err_code, char *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } float LIBMUSCLE_DataConstRef_as_real4_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -783,6 +789,7 @@ float LIBMUSCLE_DataConstRef_as_real4_(std::intptr_t self, int * err_code, char *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0.0; } double LIBMUSCLE_DataConstRef_as_real8_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -827,6 +834,7 @@ double LIBMUSCLE_DataConstRef_as_real8_(std::intptr_t self, int * err_code, char *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0.0; } std::intptr_t LIBMUSCLE_DataConstRef_as_settings_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -871,6 +879,7 @@ std::intptr_t LIBMUSCLE_DataConstRef_as_settings_(std::intptr_t self, int * err_ *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } void LIBMUSCLE_DataConstRef_as_byte_array_( @@ -918,6 +927,7 @@ std::intptr_t LIBMUSCLE_DataConstRef_get_item_by_key_( *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } std::intptr_t LIBMUSCLE_DataConstRef_get_item_by_index_( @@ -945,6 +955,7 @@ std::intptr_t LIBMUSCLE_DataConstRef_get_item_by_index_( *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } std::size_t LIBMUSCLE_DataConstRef_num_dims_( @@ -963,6 +974,7 @@ std::size_t LIBMUSCLE_DataConstRef_num_dims_( *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } void LIBMUSCLE_DataConstRef_shape_(std::intptr_t self, std::size_t ** shp, std::size_t * shp_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -1209,6 +1221,7 @@ bool LIBMUSCLE_DataConstRef_has_indexes_(std::intptr_t self, int * err_code, cha *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return false; } void LIBMUSCLE_DataConstRef_index_(std::intptr_t self, std::size_t i, char ** ret_val, std::size_t * ret_val_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -1709,6 +1722,7 @@ bool LIBMUSCLE_Data_as_logical_(std::intptr_t self, int * err_code, char ** err_ *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return false; } void LIBMUSCLE_Data_as_character_(std::intptr_t self, char ** ret_val, std::size_t * ret_val_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -1800,6 +1814,7 @@ int LIBMUSCLE_Data_as_int_(std::intptr_t self, int * err_code, char ** err_msg, *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } char LIBMUSCLE_Data_as_int1_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -1844,6 +1859,7 @@ char LIBMUSCLE_Data_as_int1_(std::intptr_t self, int * err_code, char ** err_msg *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } short int LIBMUSCLE_Data_as_int2_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -1888,6 +1904,7 @@ short int LIBMUSCLE_Data_as_int2_(std::intptr_t self, int * err_code, char ** er *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } int32_t LIBMUSCLE_Data_as_int4_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -1932,6 +1949,7 @@ int32_t LIBMUSCLE_Data_as_int4_(std::intptr_t self, int * err_code, char ** err_ *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } int64_t LIBMUSCLE_Data_as_int8_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -1976,6 +1994,7 @@ int64_t LIBMUSCLE_Data_as_int8_(std::intptr_t self, int * err_code, char ** err_ *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } float LIBMUSCLE_Data_as_real4_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -2020,6 +2039,7 @@ float LIBMUSCLE_Data_as_real4_(std::intptr_t self, int * err_code, char ** err_m *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0.0; } double LIBMUSCLE_Data_as_real8_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -2064,6 +2084,7 @@ double LIBMUSCLE_Data_as_real8_(std::intptr_t self, int * err_code, char ** err_ *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0.0; } std::intptr_t LIBMUSCLE_Data_as_settings_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -2108,6 +2129,7 @@ std::intptr_t LIBMUSCLE_Data_as_settings_(std::intptr_t self, int * err_code, ch *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } void LIBMUSCLE_Data_as_byte_array_( @@ -2155,6 +2177,7 @@ std::intptr_t LIBMUSCLE_Data_get_item_by_key_( *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } std::intptr_t LIBMUSCLE_Data_get_item_by_index_( @@ -2182,6 +2205,7 @@ std::intptr_t LIBMUSCLE_Data_get_item_by_index_( *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } std::size_t LIBMUSCLE_Data_num_dims_( @@ -2200,6 +2224,7 @@ std::size_t LIBMUSCLE_Data_num_dims_( *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } void LIBMUSCLE_Data_shape_(std::intptr_t self, std::size_t ** shp, std::size_t * shp_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -2446,6 +2471,7 @@ bool LIBMUSCLE_Data_has_indexes_(std::intptr_t self, int * err_code, char ** err *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return false; } void LIBMUSCLE_Data_index_(std::intptr_t self, std::size_t i, char ** ret_val, std::size_t * ret_val_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -3444,6 +3470,7 @@ std::intptr_t LIBMUSCLE_Data_value_( *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } std::intptr_t LIBMUSCLE_PortsDescription_create_() { @@ -3721,6 +3748,7 @@ bool LIBMUSCLE_Instance_is_setting_a_character_(std::intptr_t self, char * name, *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return false; } bool LIBMUSCLE_Instance_is_setting_a_int8_(std::intptr_t self, char * name, std::size_t name_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -3766,6 +3794,7 @@ bool LIBMUSCLE_Instance_is_setting_a_int8_(std::intptr_t self, char * name, std: *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return false; } bool LIBMUSCLE_Instance_is_setting_a_real8_(std::intptr_t self, char * name, std::size_t name_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -3811,6 +3840,7 @@ bool LIBMUSCLE_Instance_is_setting_a_real8_(std::intptr_t self, char * name, std *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return false; } bool LIBMUSCLE_Instance_is_setting_a_logical_(std::intptr_t self, char * name, std::size_t name_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -3856,6 +3886,7 @@ bool LIBMUSCLE_Instance_is_setting_a_logical_(std::intptr_t self, char * name, s *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return false; } bool LIBMUSCLE_Instance_is_setting_a_real8array_(std::intptr_t self, char * name, std::size_t name_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -3901,6 +3932,7 @@ bool LIBMUSCLE_Instance_is_setting_a_real8array_(std::intptr_t self, char * name *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return false; } bool LIBMUSCLE_Instance_is_setting_a_real8array2_(std::intptr_t self, char * name, std::size_t name_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -3946,6 +3978,7 @@ bool LIBMUSCLE_Instance_is_setting_a_real8array2_(std::intptr_t self, char * nam *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return false; } void LIBMUSCLE_Instance_get_setting_as_character_(std::intptr_t self, char * name, std::size_t name_size, char ** ret_val, std::size_t * ret_val_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -4039,6 +4072,7 @@ int64_t LIBMUSCLE_Instance_get_setting_as_int8_(std::intptr_t self, char * name, *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } double LIBMUSCLE_Instance_get_setting_as_real8_(std::intptr_t self, char * name, std::size_t name_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -4084,6 +4118,7 @@ double LIBMUSCLE_Instance_get_setting_as_real8_(std::intptr_t self, char * name, *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0.0; } bool LIBMUSCLE_Instance_get_setting_as_logical_(std::intptr_t self, char * name, std::size_t name_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -4129,6 +4164,7 @@ bool LIBMUSCLE_Instance_get_setting_as_logical_(std::intptr_t self, char * name, *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return false; } void LIBMUSCLE_Instance_get_setting_as_real8array_(std::intptr_t self, char * name, std::size_t name_size, double ** value, std::size_t * value_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -4337,6 +4373,7 @@ std::intptr_t LIBMUSCLE_Instance_receive_p_(std::intptr_t self, char * port_name *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } std::intptr_t LIBMUSCLE_Instance_receive_pd_(std::intptr_t self, char * port_name, std::size_t port_name_size, std::intptr_t default_msg, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -4383,6 +4420,7 @@ std::intptr_t LIBMUSCLE_Instance_receive_pd_(std::intptr_t self, char * port_nam *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } std::intptr_t LIBMUSCLE_Instance_receive_ps_(std::intptr_t self, char * port_name, std::size_t port_name_size, int slot, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -4428,6 +4466,7 @@ std::intptr_t LIBMUSCLE_Instance_receive_ps_(std::intptr_t self, char * port_nam *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } std::intptr_t LIBMUSCLE_Instance_receive_psd_(std::intptr_t self, char * port_name, std::size_t port_name_size, int slot, std::intptr_t default_message, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -4474,6 +4513,7 @@ std::intptr_t LIBMUSCLE_Instance_receive_psd_(std::intptr_t self, char * port_na *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } std::intptr_t LIBMUSCLE_Instance_receive_with_settings_p_(std::intptr_t self, char * port_name, std::size_t port_name_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -4519,6 +4559,7 @@ std::intptr_t LIBMUSCLE_Instance_receive_with_settings_p_(std::intptr_t self, ch *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } std::intptr_t LIBMUSCLE_Instance_receive_with_settings_pd_(std::intptr_t self, char * port_name, std::size_t port_name_size, std::intptr_t default_msg, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -4565,6 +4606,7 @@ std::intptr_t LIBMUSCLE_Instance_receive_with_settings_pd_(std::intptr_t self, c *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } std::intptr_t LIBMUSCLE_Instance_receive_with_settings_ps_(std::intptr_t self, char * port_name, std::size_t port_name_size, int slot, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -4610,6 +4652,7 @@ std::intptr_t LIBMUSCLE_Instance_receive_with_settings_ps_(std::intptr_t self, c *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } std::intptr_t LIBMUSCLE_Instance_receive_with_settings_psd_(std::intptr_t self, char * port_name, std::size_t port_name_size, int slot, std::intptr_t default_msg, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -4656,6 +4699,7 @@ std::intptr_t LIBMUSCLE_Instance_receive_with_settings_psd_(std::intptr_t self, *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } std::intptr_t LIBMUSCLE_IMPL_BINDINGS_CmdLineArgs_create_(int count) { diff --git a/libmuscle/cpp/src/libmuscle/bindings/libmuscle_mpi_fortran_c.cpp b/libmuscle/cpp/src/libmuscle/bindings/libmuscle_mpi_fortran_c.cpp index c66b971f..877bb2a6 100644 --- a/libmuscle/cpp/src/libmuscle/bindings/libmuscle_mpi_fortran_c.cpp +++ b/libmuscle/cpp/src/libmuscle/bindings/libmuscle_mpi_fortran_c.cpp @@ -472,6 +472,7 @@ bool LIBMUSCLE_DataConstRef_as_logical_(std::intptr_t self, int * err_code, char *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return false; } void LIBMUSCLE_DataConstRef_as_character_(std::intptr_t self, char ** ret_val, std::size_t * ret_val_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -563,6 +564,7 @@ int LIBMUSCLE_DataConstRef_as_int_(std::intptr_t self, int * err_code, char ** e *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } char LIBMUSCLE_DataConstRef_as_int1_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -607,6 +609,7 @@ char LIBMUSCLE_DataConstRef_as_int1_(std::intptr_t self, int * err_code, char ** *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } short int LIBMUSCLE_DataConstRef_as_int2_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -651,6 +654,7 @@ short int LIBMUSCLE_DataConstRef_as_int2_(std::intptr_t self, int * err_code, ch *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } int32_t LIBMUSCLE_DataConstRef_as_int4_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -695,6 +699,7 @@ int32_t LIBMUSCLE_DataConstRef_as_int4_(std::intptr_t self, int * err_code, char *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } int64_t LIBMUSCLE_DataConstRef_as_int8_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -739,6 +744,7 @@ int64_t LIBMUSCLE_DataConstRef_as_int8_(std::intptr_t self, int * err_code, char *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } float LIBMUSCLE_DataConstRef_as_real4_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -783,6 +789,7 @@ float LIBMUSCLE_DataConstRef_as_real4_(std::intptr_t self, int * err_code, char *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0.0; } double LIBMUSCLE_DataConstRef_as_real8_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -827,6 +834,7 @@ double LIBMUSCLE_DataConstRef_as_real8_(std::intptr_t self, int * err_code, char *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0.0; } std::intptr_t LIBMUSCLE_DataConstRef_as_settings_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -871,6 +879,7 @@ std::intptr_t LIBMUSCLE_DataConstRef_as_settings_(std::intptr_t self, int * err_ *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } void LIBMUSCLE_DataConstRef_as_byte_array_( @@ -918,6 +927,7 @@ std::intptr_t LIBMUSCLE_DataConstRef_get_item_by_key_( *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } std::intptr_t LIBMUSCLE_DataConstRef_get_item_by_index_( @@ -945,6 +955,7 @@ std::intptr_t LIBMUSCLE_DataConstRef_get_item_by_index_( *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } std::size_t LIBMUSCLE_DataConstRef_num_dims_( @@ -963,6 +974,7 @@ std::size_t LIBMUSCLE_DataConstRef_num_dims_( *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } void LIBMUSCLE_DataConstRef_shape_(std::intptr_t self, std::size_t ** shp, std::size_t * shp_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -1209,6 +1221,7 @@ bool LIBMUSCLE_DataConstRef_has_indexes_(std::intptr_t self, int * err_code, cha *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return false; } void LIBMUSCLE_DataConstRef_index_(std::intptr_t self, std::size_t i, char ** ret_val, std::size_t * ret_val_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -1709,6 +1722,7 @@ bool LIBMUSCLE_Data_as_logical_(std::intptr_t self, int * err_code, char ** err_ *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return false; } void LIBMUSCLE_Data_as_character_(std::intptr_t self, char ** ret_val, std::size_t * ret_val_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -1800,6 +1814,7 @@ int LIBMUSCLE_Data_as_int_(std::intptr_t self, int * err_code, char ** err_msg, *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } char LIBMUSCLE_Data_as_int1_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -1844,6 +1859,7 @@ char LIBMUSCLE_Data_as_int1_(std::intptr_t self, int * err_code, char ** err_msg *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } short int LIBMUSCLE_Data_as_int2_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -1888,6 +1904,7 @@ short int LIBMUSCLE_Data_as_int2_(std::intptr_t self, int * err_code, char ** er *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } int32_t LIBMUSCLE_Data_as_int4_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -1932,6 +1949,7 @@ int32_t LIBMUSCLE_Data_as_int4_(std::intptr_t self, int * err_code, char ** err_ *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } int64_t LIBMUSCLE_Data_as_int8_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -1976,6 +1994,7 @@ int64_t LIBMUSCLE_Data_as_int8_(std::intptr_t self, int * err_code, char ** err_ *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } float LIBMUSCLE_Data_as_real4_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -2020,6 +2039,7 @@ float LIBMUSCLE_Data_as_real4_(std::intptr_t self, int * err_code, char ** err_m *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0.0; } double LIBMUSCLE_Data_as_real8_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -2064,6 +2084,7 @@ double LIBMUSCLE_Data_as_real8_(std::intptr_t self, int * err_code, char ** err_ *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0.0; } std::intptr_t LIBMUSCLE_Data_as_settings_(std::intptr_t self, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -2108,6 +2129,7 @@ std::intptr_t LIBMUSCLE_Data_as_settings_(std::intptr_t self, int * err_code, ch *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } void LIBMUSCLE_Data_as_byte_array_( @@ -2155,6 +2177,7 @@ std::intptr_t LIBMUSCLE_Data_get_item_by_key_( *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } std::intptr_t LIBMUSCLE_Data_get_item_by_index_( @@ -2182,6 +2205,7 @@ std::intptr_t LIBMUSCLE_Data_get_item_by_index_( *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } std::size_t LIBMUSCLE_Data_num_dims_( @@ -2200,6 +2224,7 @@ std::size_t LIBMUSCLE_Data_num_dims_( *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } void LIBMUSCLE_Data_shape_(std::intptr_t self, std::size_t ** shp, std::size_t * shp_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -2446,6 +2471,7 @@ bool LIBMUSCLE_Data_has_indexes_(std::intptr_t self, int * err_code, char ** err *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return false; } void LIBMUSCLE_Data_index_(std::intptr_t self, std::size_t i, char ** ret_val, std::size_t * ret_val_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -3444,6 +3470,7 @@ std::intptr_t LIBMUSCLE_Data_value_( *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } std::intptr_t LIBMUSCLE_PortsDescription_create_() { @@ -3728,6 +3755,7 @@ bool LIBMUSCLE_Instance_is_setting_a_character_(std::intptr_t self, char * name, *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return false; } bool LIBMUSCLE_Instance_is_setting_a_int8_(std::intptr_t self, char * name, std::size_t name_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -3773,6 +3801,7 @@ bool LIBMUSCLE_Instance_is_setting_a_int8_(std::intptr_t self, char * name, std: *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return false; } bool LIBMUSCLE_Instance_is_setting_a_real8_(std::intptr_t self, char * name, std::size_t name_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -3818,6 +3847,7 @@ bool LIBMUSCLE_Instance_is_setting_a_real8_(std::intptr_t self, char * name, std *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return false; } bool LIBMUSCLE_Instance_is_setting_a_logical_(std::intptr_t self, char * name, std::size_t name_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -3863,6 +3893,7 @@ bool LIBMUSCLE_Instance_is_setting_a_logical_(std::intptr_t self, char * name, s *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return false; } bool LIBMUSCLE_Instance_is_setting_a_real8array_(std::intptr_t self, char * name, std::size_t name_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -3908,6 +3939,7 @@ bool LIBMUSCLE_Instance_is_setting_a_real8array_(std::intptr_t self, char * name *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return false; } bool LIBMUSCLE_Instance_is_setting_a_real8array2_(std::intptr_t self, char * name, std::size_t name_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -3953,6 +3985,7 @@ bool LIBMUSCLE_Instance_is_setting_a_real8array2_(std::intptr_t self, char * nam *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return false; } void LIBMUSCLE_Instance_get_setting_as_character_(std::intptr_t self, char * name, std::size_t name_size, char ** ret_val, std::size_t * ret_val_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -4046,6 +4079,7 @@ int64_t LIBMUSCLE_Instance_get_setting_as_int8_(std::intptr_t self, char * name, *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } double LIBMUSCLE_Instance_get_setting_as_real8_(std::intptr_t self, char * name, std::size_t name_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -4091,6 +4125,7 @@ double LIBMUSCLE_Instance_get_setting_as_real8_(std::intptr_t self, char * name, *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0.0; } bool LIBMUSCLE_Instance_get_setting_as_logical_(std::intptr_t self, char * name, std::size_t name_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -4136,6 +4171,7 @@ bool LIBMUSCLE_Instance_get_setting_as_logical_(std::intptr_t self, char * name, *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return false; } void LIBMUSCLE_Instance_get_setting_as_real8array_(std::intptr_t self, char * name, std::size_t name_size, double ** value, std::size_t * value_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -4344,6 +4380,7 @@ std::intptr_t LIBMUSCLE_Instance_receive_p_(std::intptr_t self, char * port_name *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } std::intptr_t LIBMUSCLE_Instance_receive_pd_(std::intptr_t self, char * port_name, std::size_t port_name_size, std::intptr_t default_msg, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -4390,6 +4427,7 @@ std::intptr_t LIBMUSCLE_Instance_receive_pd_(std::intptr_t self, char * port_nam *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } std::intptr_t LIBMUSCLE_Instance_receive_ps_(std::intptr_t self, char * port_name, std::size_t port_name_size, int slot, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -4435,6 +4473,7 @@ std::intptr_t LIBMUSCLE_Instance_receive_ps_(std::intptr_t self, char * port_nam *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } std::intptr_t LIBMUSCLE_Instance_receive_psd_(std::intptr_t self, char * port_name, std::size_t port_name_size, int slot, std::intptr_t default_message, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -4481,6 +4520,7 @@ std::intptr_t LIBMUSCLE_Instance_receive_psd_(std::intptr_t self, char * port_na *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } std::intptr_t LIBMUSCLE_Instance_receive_with_settings_p_(std::intptr_t self, char * port_name, std::size_t port_name_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -4526,6 +4566,7 @@ std::intptr_t LIBMUSCLE_Instance_receive_with_settings_p_(std::intptr_t self, ch *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } std::intptr_t LIBMUSCLE_Instance_receive_with_settings_pd_(std::intptr_t self, char * port_name, std::size_t port_name_size, std::intptr_t default_msg, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -4572,6 +4613,7 @@ std::intptr_t LIBMUSCLE_Instance_receive_with_settings_pd_(std::intptr_t self, c *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } std::intptr_t LIBMUSCLE_Instance_receive_with_settings_ps_(std::intptr_t self, char * port_name, std::size_t port_name_size, int slot, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -4617,6 +4659,7 @@ std::intptr_t LIBMUSCLE_Instance_receive_with_settings_ps_(std::intptr_t self, c *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } std::intptr_t LIBMUSCLE_Instance_receive_with_settings_psd_(std::intptr_t self, char * port_name, std::size_t port_name_size, int slot, std::intptr_t default_msg, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -4663,6 +4706,7 @@ std::intptr_t LIBMUSCLE_Instance_receive_with_settings_psd_(std::intptr_t self, *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } std::intptr_t LIBMUSCLE_IMPL_BINDINGS_CmdLineArgs_create_(int count) { diff --git a/libmuscle/cpp/src/ymmsl/bindings/ymmsl_fortran_c.cpp b/libmuscle/cpp/src/ymmsl/bindings/ymmsl_fortran_c.cpp index f642e515..6c7e7ff5 100644 --- a/libmuscle/cpp/src/ymmsl/bindings/ymmsl_fortran_c.cpp +++ b/libmuscle/cpp/src/ymmsl/bindings/ymmsl_fortran_c.cpp @@ -87,6 +87,7 @@ bool YMMSL_Settings_is_a_character_(std::intptr_t self, char * key, std::size_t *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return false; } bool YMMSL_Settings_is_a_int4_(std::intptr_t self, char * key, std::size_t key_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -132,6 +133,7 @@ bool YMMSL_Settings_is_a_int4_(std::intptr_t self, char * key, std::size_t key_s *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return false; } bool YMMSL_Settings_is_a_int8_(std::intptr_t self, char * key, std::size_t key_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -177,6 +179,7 @@ bool YMMSL_Settings_is_a_int8_(std::intptr_t self, char * key, std::size_t key_s *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return false; } bool YMMSL_Settings_is_a_real8_(std::intptr_t self, char * key, std::size_t key_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -222,6 +225,7 @@ bool YMMSL_Settings_is_a_real8_(std::intptr_t self, char * key, std::size_t key_ *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return false; } bool YMMSL_Settings_is_a_logical_(std::intptr_t self, char * key, std::size_t key_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -267,6 +271,7 @@ bool YMMSL_Settings_is_a_logical_(std::intptr_t self, char * key, std::size_t ke *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return false; } bool YMMSL_Settings_is_a_real8array_(std::intptr_t self, char * key, std::size_t key_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -312,6 +317,7 @@ bool YMMSL_Settings_is_a_real8array_(std::intptr_t self, char * key, std::size_t *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return false; } bool YMMSL_Settings_is_a_real8array2_(std::intptr_t self, char * key, std::size_t key_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -357,6 +363,7 @@ bool YMMSL_Settings_is_a_real8array2_(std::intptr_t self, char * key, std::size_ *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return false; } void YMMSL_Settings_set_character_(std::intptr_t self, char * key, std::size_t key_size, char * value, std::size_t value_size) { @@ -506,6 +513,7 @@ int32_t YMMSL_Settings_get_as_int4_(std::intptr_t self, char * key, std::size_t *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } int64_t YMMSL_Settings_get_as_int8_(std::intptr_t self, char * key, std::size_t key_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -551,6 +559,7 @@ int64_t YMMSL_Settings_get_as_int8_(std::intptr_t self, char * key, std::size_t *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0; } double YMMSL_Settings_get_as_real8_(std::intptr_t self, char * key, std::size_t key_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -596,6 +605,7 @@ double YMMSL_Settings_get_as_real8_(std::intptr_t self, char * key, std::size_t *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return 0.0; } bool YMMSL_Settings_get_as_logical_(std::intptr_t self, char * key, std::size_t key_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) { @@ -641,6 +651,7 @@ bool YMMSL_Settings_get_as_logical_(std::intptr_t self, char * key, std::size_t *err_msg = const_cast(msg.data()); *err_msg_len = msg.size(); } + return false; } void YMMSL_Settings_get_as_real8array_(std::intptr_t self, char * key, std::size_t key_size, double ** value, std::size_t * value_size, int * err_code, char ** err_msg, std::size_t * err_msg_len) { diff --git a/scripts/api_generator.py b/scripts/api_generator.py index 2823c4fc..e6c4f5a6 100644 --- a/scripts/api_generator.py +++ b/scripts/api_generator.py @@ -132,6 +132,9 @@ def f_call_c(self, result_name: str, call: str) -> str: def f_return_result(self, return_name: str, result_name: str) -> str: return '' + def fc_return_default(self) -> str: + return '' # memfun has void signature + class String(Par): """Represents a string-typed parameter. @@ -197,6 +200,9 @@ def fc_return(self) -> str: ' *{0}_size = result.size();\n' ' return;\n').format(self.name) + def fc_return_default(self) -> str: + return '' # memfun has void signature + class VecDbl(Par): """Represents a vector of double parameter. @@ -263,6 +269,9 @@ def fc_return(self) -> str: ' *{0}_size = result.size();\n' ' return;\n').format(self.name) + def fc_return_default(self) -> str: + return '' # memfun has void signature + class Vec2Dbl(Par): """Represents a vector of vector of double parameter. @@ -350,6 +359,9 @@ def fc_return(self) -> str: return textwrap.indent(result.format(self.name), ' ') + def fc_return_default(self) -> str: + return '' # memfun has void signature + class VecSizet(Par): """Represents a vector of size_t parameter. @@ -416,6 +428,9 @@ def fc_return(self) -> str: ' *{0}_size = result.size();\n' ' return;\n').format(self.name) + def fc_return_default(self) -> str: + return '' # memfun has void signature + class Array(Par): def __init__( @@ -557,6 +572,9 @@ def fc_return(self) -> str: self.ndims), ' ') + def fc_return_default(self) -> str: + return '' # memfun has void signature + def _f_dims(self) -> str: return ', '.join([':'] * self.ndims) @@ -624,6 +642,9 @@ def fc_return(self) -> str: ' *{0}_size = result.size();\n' ' return;\n').format(self.name) + def fc_return_default(self) -> str: + return '' # memfun has void signature + class Obj(Par): """Represents an object of a type to pass. @@ -686,6 +707,9 @@ def f_call_c(self, result_name: str, call: str) -> str: def f_return_result(self, return_name: str, result_name: str) -> str: return ' {}%ptr = {}\n'.format(return_name, result_name) + def fc_return_default(self) -> str: + return ' return 0;\n' + class Bool(Par): """Represents a bool-typed parameter. @@ -732,6 +756,9 @@ def f_call_c(self, result_name: str, call: str) -> str: def f_return_result(self, return_name: str, result_name: str) -> str: return ' {} = {}\n'.format(return_name, result_name) + def fc_return_default(self) -> str: + return ' return false;\n' + class EnumVal(Par): """Represents an enum-typed parameter. @@ -784,6 +811,9 @@ def fc_get_result(self, cpp_chain_call: str) -> str: def fc_return(self) -> str: return ' return static_cast(result);\n' + def fc_return_default(self) -> str: + return ' return 0;\n' + class Int(Par): """Represents an int-typed parameter. @@ -821,6 +851,9 @@ def fc_get_result(self, cpp_chain_call: str) -> str: def fc_return(self) -> str: return ' return result;\n' + def fc_return_default(self) -> str: + return ' return 0;\n' + class Char(Par): """Represents an char-typed parameter. @@ -859,6 +892,9 @@ def fc_get_result(self, cpp_chain_call: str) -> str: def fc_return(self) -> str: return ' return result;\n' + def fc_return_default(self) -> str: + return ' return 0;\n' + class Int16t(Par): """Represents an int16_t-typed parameter. @@ -896,6 +932,9 @@ def fc_get_result(self, cpp_chain_call: str) -> str: def fc_return(self) -> str: return ' return result;\n' + def fc_return_default(self) -> str: + return ' return 0;\n' + class Int32t(Par): """Represents an int32_t-typed parameter. @@ -934,6 +973,9 @@ def fc_get_result(self, cpp_chain_call: str) -> str: def fc_return(self) -> str: return ' return result;\n' + def fc_return_default(self) -> str: + return ' return 0;\n' + class Int64t(Par): """Represents an int64_t-typed parameter. @@ -971,6 +1013,9 @@ def fc_get_result(self, cpp_chain_call: str) -> str: def fc_return(self) -> str: return ' return result;\n' + def fc_return_default(self) -> str: + return ' return 0;\n' + class Sizet(Par): """Represents an size_t-typed parameter. @@ -1009,6 +1054,9 @@ def fc_get_result(self, cpp_chain_call: str) -> str: def fc_return(self) -> str: return ' return result;\n' + def fc_return_default(self) -> str: + return ' return 0;\n' + class Float(Par): """Represents a single precision float parameter. @@ -1047,6 +1095,9 @@ def fc_get_result(self, cpp_chain_call: str) -> str: def fc_return(self) -> str: return ' return result;\n' + def fc_return_default(self) -> str: + return ' return 0.0;\n' + class Double(Par): """Represents a double precision float parameter. @@ -1085,6 +1136,9 @@ def fc_get_result(self, cpp_chain_call: str) -> str: def fc_return(self) -> str: return ' return result;\n' + def fc_return_default(self) -> str: + return ' return 0.0;\n' + class T(Par): """Represents a template dummy type. @@ -1227,6 +1281,7 @@ def fortran_c_wrapper(self) -> str: catch += ' *err_msg_len = msg.size();\n' catch += '}\n' result += textwrap.indent(catch, 4*' ') + result += self._fc_return_default() else: result += self._fc_cpp_call() result += self._fc_return() @@ -1436,6 +1491,9 @@ def _fc_cpp_call(self) -> str: def _fc_return(self) -> str: return self.ret_type.fc_return() + def _fc_return_default(self) -> str: + return self.ret_type.fc_return_default() + def _fc_in_parameters(self) -> List[str]: """Create a list of input parameters. """ diff --git a/scripts/make_libmuscle_api.py b/scripts/make_libmuscle_api.py index 33a17a52..0b083a2e 100755 --- a/scripts/make_libmuscle_api.py +++ b/scripts/make_libmuscle_api.py @@ -451,6 +451,7 @@ def __copy__(self) -> 'Elements': ' *err_msg = const_cast(msg.data());\n' ' *err_msg_len = msg.size();\n' ' }\n' + ' return 0;\n' '}\n\n') ), MemFun(Obj('DataConstRef', 'value'), 'get_item_by_index', [Sizet('i')], True, @@ -480,6 +481,7 @@ def __copy__(self) -> 'Elements': ' *err_msg = const_cast(msg.data());\n' ' *err_msg_len = msg.size();\n' ' }\n' + ' return 0;\n' '}\n\n') ), OverloadSet('get_item', [ @@ -503,6 +505,7 @@ def __copy__(self) -> 'Elements': ' *err_msg = const_cast(msg.data());\n' ' *err_msg_len = msg.size();\n' ' }\n' + ' return 0;\n' '}\n\n') ), MemFun(VecSizet('shp'), 'shape', [], True), @@ -623,6 +626,7 @@ def __copy__(self) -> 'Elements': ' *err_msg = const_cast(msg.data());\n' ' *err_msg_len = msg.size();\n' ' }\n' + ' return 0;\n' '}\n\n') ), MemFun(Obj('Data', 'value'), 'get_item_by_index', [Sizet('i')], True, @@ -652,6 +656,7 @@ def __copy__(self) -> 'Elements': ' *err_msg = const_cast(msg.data());\n' ' *err_msg_len = msg.size();\n' ' }\n' + ' return 0;\n' '}\n\n') ), OverloadSet('get_item', [ @@ -743,6 +748,7 @@ def __copy__(self) -> 'Elements': ' *err_msg = const_cast(msg.data());\n' ' *err_msg_len = msg.size();\n' ' }\n' + ' return 0;\n' '}\n\n') ), ]) From 272617d656600ecbe1d78473a14375730fd2c0be Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Thu, 3 Nov 2022 11:34:08 +0100 Subject: [PATCH 064/183] Fix some compiler warnings - Add virtual destructor for TcpTransportServer (fixes a -Wdelete-abstract-non-virtual-dtor) - Add std::move to avoid copying data (fixes a few -Wreturn-std-move) --- libmuscle/cpp/src/libmuscle/data.cpp | 2 +- libmuscle/cpp/src/libmuscle/mcp/tcp_transport_client.cpp | 2 +- libmuscle/cpp/src/libmuscle/mcp/transport_server.hpp | 4 ++++ libmuscle/cpp/src/libmuscle/mpp_message.cpp | 2 +- 4 files changed, 7 insertions(+), 3 deletions(-) diff --git a/libmuscle/cpp/src/libmuscle/data.cpp b/libmuscle/cpp/src/libmuscle/data.cpp index 53b57694..666a8423 100644 --- a/libmuscle/cpp/src/libmuscle/data.cpp +++ b/libmuscle/cpp/src/libmuscle/data.cpp @@ -962,7 +962,7 @@ DataConstRef DataConstRef::grid_data_( Data result = Data::byte_array(num_elems); char * data_copy = result.as_byte_array(); std::copy(data, data + num_elems, data_copy); - return result; + return std::move(result); } } diff --git a/libmuscle/cpp/src/libmuscle/mcp/tcp_transport_client.cpp b/libmuscle/cpp/src/libmuscle/mcp/tcp_transport_client.cpp index 959737d2..c0e95b90 100644 --- a/libmuscle/cpp/src/libmuscle/mcp/tcp_transport_client.cpp +++ b/libmuscle/cpp/src/libmuscle/mcp/tcp_transport_client.cpp @@ -129,7 +129,7 @@ DataConstRef TcpTransportClient::call( int64_t length = recv_int64(socket_fd_); auto result = Data::byte_array(length); recv_all(socket_fd_, result.as_byte_array(), result.size()); - return result; + return std::move(result); } void TcpTransportClient::close() { diff --git a/libmuscle/cpp/src/libmuscle/mcp/transport_server.hpp b/libmuscle/cpp/src/libmuscle/mcp/transport_server.hpp index c37a61d0..32ada05d 100644 --- a/libmuscle/cpp/src/libmuscle/mcp/transport_server.hpp +++ b/libmuscle/cpp/src/libmuscle/mcp/transport_server.hpp @@ -66,6 +66,10 @@ class TransportServer { */ TransportServer(RequestHandler & handler); + /** Destroy the Transport Server object + */ + virtual ~TransportServer() = default; + /** Returns the location this server listens on. * * @return A string containing the location. diff --git a/libmuscle/cpp/src/libmuscle/mpp_message.cpp b/libmuscle/cpp/src/libmuscle/mpp_message.cpp index 2962e31c..5f796224 100644 --- a/libmuscle/cpp/src/libmuscle/mpp_message.cpp +++ b/libmuscle/cpp/src/libmuscle/mpp_message.cpp @@ -76,7 +76,7 @@ DataConstRef MPPMessage::encoded() const { auto bytes = Data::byte_array(sbuf.size()); memcpy(bytes.as_byte_array(), sbuf.data(), sbuf.size()); - return bytes; + return std::move(bytes); } } } From b4d74c79e0832ae43caa4a012d94b5224bb1e891 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Mon, 7 Nov 2022 16:01:01 +0100 Subject: [PATCH 065/183] Fixes #126. See issue for more details. --- .../examples/rd_implementations.ymmsl.in | 20 +++++++++---------- integration_test/test_start_all.py | 2 +- integration_test/test_start_mpi.py | 4 ++-- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/docs/source/examples/rd_implementations.ymmsl.in b/docs/source/examples/rd_implementations.ymmsl.in index c6cef229..4f2b0c7b 100644 --- a/docs/source/examples/rd_implementations.ymmsl.in +++ b/docs/source/examples/rd_implementations.ymmsl.in @@ -8,23 +8,23 @@ implementations: reaction_cpp: env: - +LD_LIBRARY_PATH: MUSCLE3_HOME/lib + +LD_LIBRARY_PATH: :MUSCLE3_HOME/lib executable: MUSCLE3_EXAMPLES/cpp/build/reaction reaction_cpp_mpi: env: - +LD_LIBRARY_PATH: MUSCLE3_HOME/lib + +LD_LIBRARY_PATH: :MUSCLE3_HOME/lib executable: MUSCLE3_EXAMPLES/cpp/build/reaction_mpi execution_model: openmpi reaction_fortran: env: - +LD_LIBRARY_PATH: MUSCLE3_HOME/lib + +LD_LIBRARY_PATH: :MUSCLE3_HOME/lib executable: MUSCLE3_EXAMPLES/fortran/build/reaction reaction_fortran_mpi: env: - +LD_LIBRARY_PATH: MUSCLE3_HOME/lib + +LD_LIBRARY_PATH: :MUSCLE3_HOME/lib executable: MUSCLE3_EXAMPLES/fortran/build/reaction_mpi execution_model: openmpi @@ -35,30 +35,30 @@ implementations: diffusion_cpp: env: - +LD_LIBRARY_PATH: MUSCLE3_HOME/lib + +LD_LIBRARY_PATH: :MUSCLE3_HOME/lib executable: MUSCLE3_EXAMPLES/cpp/build/diffusion diffusion_fortran: env: - +LD_LIBRARY_PATH: MUSCLE3_HOME/lib + +LD_LIBRARY_PATH: :MUSCLE3_HOME/lib executable: MUSCLE3_EXAMPLES/fortran/build/diffusion mc_driver_cpp: env: - +LD_LIBRARY_PATH: MUSCLE3_HOME/lib + +LD_LIBRARY_PATH: :MUSCLE3_HOME/lib executable: MUSCLE3_EXAMPLES/cpp/build/mc_driver mc_driver_fortran: env: - +LD_LIBRARY_PATH: MUSCLE3_HOME/lib + +LD_LIBRARY_PATH: :MUSCLE3_HOME/lib executable: MUSCLE3_EXAMPLES/fortran/build/mc_driver load_balancer_cpp: env: - +LD_LIBRARY_PATH: MUSCLE3_HOME/lib + +LD_LIBRARY_PATH: :MUSCLE3_HOME/lib executable: MUSCLE3_EXAMPLES/cpp/build/load_balancer load_balancer_fortran: env: - +LD_LIBRARY_PATH: MUSCLE3_HOME/lib + +LD_LIBRARY_PATH: :MUSCLE3_HOME/lib executable: MUSCLE3_EXAMPLES/fortran/build/load_balancer diff --git a/integration_test/test_start_all.py b/integration_test/test_start_all.py index bb8d03b2..e2cb813b 100644 --- a/integration_test/test_start_all.py +++ b/integration_test/test_start_all.py @@ -42,7 +42,7 @@ def test_start_all(tmpdir): 'implementations:\n' ' component:\n' ' env:\n' - ' +LD_LIBRARY_PATH: {}\n' + ' +LD_LIBRARY_PATH: :{}\n' ' executable: {}\n' 'resources:\n' ' macro:\n' diff --git a/integration_test/test_start_mpi.py b/integration_test/test_start_mpi.py index 233cd210..dca5c9d2 100644 --- a/integration_test/test_start_mpi.py +++ b/integration_test/test_start_mpi.py @@ -49,11 +49,11 @@ def test_start_mpi(tmpdir): 'implementations:\n' ' component:\n' ' env:\n' - ' +LD_LIBRARY_PATH: {}\n' + ' +LD_LIBRARY_PATH: :{}\n' ' executable: {}\n' ' mpi_component:\n' ' env:\n' - ' +LD_LIBRARY_PATH: {}\n' + ' +LD_LIBRARY_PATH: :{}\n' ' executable: {}\n' ' execution_model: openmpi\n' 'resources:\n' From 6587894ad11df1679aa9f974e0758aa88255b9f6 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Wed, 9 Nov 2022 11:41:48 +0100 Subject: [PATCH 066/183] Fix missing function call Nice catch of mypy! --- libmuscle/python/libmuscle/instance.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py index 513018d6..ae952111 100644 --- a/libmuscle/python/libmuscle/instance.py +++ b/libmuscle/python/libmuscle/instance.py @@ -502,13 +502,13 @@ def __receive_message( else: msg = self._communicator.receive_message( port_name, slot, default) - if port.is_connected and not port.is_open(slot): + if port.is_connected() and not port.is_open(slot): err_msg = (('Port {} was closed while trying to' ' receive on it, did the peer crash?' ).format(port_name)) self.__shutdown(err_msg) raise RuntimeError(err_msg) - if port.is_connected and not with_settings: + if port.is_connected() and not with_settings: self.__check_compatibility(port_name, msg.settings) if not with_settings: msg.settings = None From 50251724db1c060bc226f9f96f1b0486a93d601d Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Thu, 10 Nov 2022 14:25:44 +0100 Subject: [PATCH 067/183] Update call signatures to new design Skip failing unit tests. --- integration_test/test_snapshot_macro_micro.py | 58 ++++++++-------- .../python/libmuscle/checkpoint_triggers.py | 23 ++----- libmuscle/python/libmuscle/instance.py | 68 ++++++++----------- .../python/libmuscle/snapshot_manager.py | 23 +++++-- .../test/test_checkpoint_triggers.py | 2 + .../libmuscle/test/test_snapshot_manager.py | 2 + 6 files changed, 86 insertions(+), 90 deletions(-) diff --git a/integration_test/test_snapshot_macro_micro.py b/integration_test/test_snapshot_macro_micro.py index 93427098..ae657b5b 100644 --- a/integration_test/test_snapshot_macro_micro.py +++ b/integration_test/test_snapshot_macro_micro.py @@ -17,26 +17,24 @@ def macro(): Operator.S: ['s']}) while instance.reuse_instance(): - t_cur = instance.get_setting('t0', 'float') dt = instance.get_setting('dt', 'float') t_max = instance.get_setting('t_max', 'float') if instance.resuming(): msg = instance.load_snapshot() + # load state from message t_cur = msg.timestamp - assert msg.next_timestamp == pytest.approx(t_cur + dt) i = msg.data - assert i >= 0 - else: + assert i >= 1 + + if instance.should_init(): + t_cur = instance.get_setting('t0', 'float') i = 0 while t_cur + dt <= t_max: t_next = t_cur + dt - - if instance.should_save_snapshot(t_cur, t_next): - instance.save_snapshot(Message(t_cur, t_next, i)) - - t_next = None if t_next + dt > t_max else t_next + if t_next + dt > t_max: + t_next = None # final iteration of this time-integration loop instance.send('o_i', Message(t_cur, t_next, i)) msg = instance.receive('s') @@ -45,6 +43,9 @@ def macro(): i += 1 t_cur += dt + if instance.should_save_snapshot(t_cur): + instance.save_snapshot(Message(t_cur, None, i)) + if instance.should_save_final_snapshot(t_cur): instance.save_final_snapshot(Message(t_cur, None, i)) @@ -55,26 +56,24 @@ def macro_vector(): Operator.S: ['s[]']}) while instance.reuse_instance(): - t_cur = instance.get_setting('t0', 'float') dt = instance.get_setting('dt', 'float') t_max = instance.get_setting('t_max', 'float') if instance.resuming(): msg = instance.load_snapshot() + # load state from message t_cur = msg.timestamp - assert msg.next_timestamp == pytest.approx(t_cur + dt) i = msg.data - assert i >= 0 - else: + assert i >= 1 + + if instance.should_init(): + t_cur = instance.get_setting('t0', 'float') i = 0 while t_cur + dt <= t_max: t_next = t_cur + dt - - if instance.should_save_snapshot(t_cur, t_next): - instance.save_snapshot(Message(t_cur, t_next, i)) - - t_next = None if t_next + dt > t_max else t_next + if t_next + dt > t_max: + t_next = None # final iteration of this time-integration loop for slot in range(instance.get_port_length('o_i')): instance.send('o_i', Message(t_cur, t_next, i), slot) @@ -85,7 +84,10 @@ def macro_vector(): i += 1 t_cur += dt - if instance.should_save_final_snapshot(t_cur): + if instance.should_save_snapshot(t_cur): + instance.save_snapshot(Message(t_cur, None, i)) + + if instance.should_save_final_snapshot(): instance.save_final_snapshot(Message(t_cur, None, i)) @@ -102,25 +104,25 @@ def micro(): msg = instance.load_snapshot() t_cur = msg.timestamp i, t_stop = msg.data - else: + + if instance.should_init(): msg = instance.receive('f_i') t_cur = msg.timestamp i = msg.data t_stop = t_cur + t_max while t_cur < t_stop: - t_next = t_cur + dt - - if instance.should_save_snapshot(t_cur, t_next): - instance.save_snapshot(Message(t_cur, t_next, [i, t_stop])) - + # faux time-integration for testing snapshots t_cur += dt - if instance.should_save_final_snapshot(t_cur): - instance.save_final_snapshot(Message(t_cur, None, [i, t_stop])) + if instance.should_save_snapshot(t_cur): + instance.save_snapshot(Message(t_cur, None, [i, t_stop])) instance.send('o_f', Message(t_cur, None, i)) + if instance.should_save_final_snapshot(): + instance.save_final_snapshot(Message(t_cur, None, [i, t_stop])) + @pytest.fixture def base_config(): @@ -163,6 +165,7 @@ def base_config(): - every: 0.4""") +@pytest.mark.skip("To be updated") def test_snapshot_macro_micro(tmp_path, base_config): base_config.check_consistent() run_dir1 = RunDir(tmp_path / 'run1') @@ -203,6 +206,7 @@ def test_snapshot_macro_micro(tmp_path, base_config): assert len(snapshots_ymmsl) == 2 +@pytest.mark.skip("To be updated") def test_snapshot_macro_vector_micro(tmp_path, base_config): macro_implementation = base_config.implementations['macro_implementation'] macro_implementation.args[-1] = 'macro_vector' diff --git a/libmuscle/python/libmuscle/checkpoint_triggers.py b/libmuscle/python/libmuscle/checkpoint_triggers.py index 88d47553..b134f76d 100644 --- a/libmuscle/python/libmuscle/checkpoint_triggers.py +++ b/libmuscle/python/libmuscle/checkpoint_triggers.py @@ -215,8 +215,7 @@ def elapsed_walltime(self) -> float: """ return time.monotonic() - self._monotonic_reference - def should_save_snapshot(self, timestamp: float, - next_timestamp: Optional[float]) -> bool: + def should_save_snapshot(self, timestamp: float) -> bool: """Handles instance.should_save_snapshot """ if self._should_have_saved: @@ -224,19 +223,12 @@ def should_save_snapshot(self, timestamp: float, '"should_save_final_snapshot" returned positive' ' but no snapshot was saved before the next call') - value = False elapsed_walltime = self.elapsed_walltime() - if next_timestamp is None: - _logger.warning('No "next_timestamp" provided. Workflow may not' - ' be able to create a consistent snapshot. See ' - 'https://muscle3.readthedocs.io/en/latest/checkpoints.html') - value = self.__should_save(elapsed_walltime, timestamp) - else: - value = self.__should_save(elapsed_walltime, next_timestamp) + value = self.__should_save(elapsed_walltime, timestamp) self._should_have_saved = value return value - def should_save_final_snapshot(self, timestamp: float) -> bool: + def should_save_final_snapshot(self) -> bool: """Handles instance.should_save_final_snapshot """ if self._should_have_saved: @@ -285,9 +277,7 @@ def reuse_instance(self, max_f_init_next_timestamp: Optional[float] self._should_save_final_called = False self._saved_final_checkpoint = False - def update_checkpoints(self, timestamp: float, - next_timestamp: Optional[float], final: bool - ) -> None: + def update_checkpoints(self, timestamp: float, final: bool) -> None: """Update last and next checkpoint times when a snapshot is made Args: @@ -300,10 +290,7 @@ def update_checkpoints(self, timestamp: float, if final and self._max_f_init_next_timestamp is not None: simulation_time = self._max_f_init_next_timestamp else: - if next_timestamp is None: - simulation_time = timestamp - else: - simulation_time = next_timestamp + simulation_time = timestamp self._prevsim = simulation_time self._nextsim = self._sim.next_checkpoint(simulation_time) diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py index e31ab594..755503bc 100644 --- a/libmuscle/python/libmuscle/instance.py +++ b/libmuscle/python/libmuscle/instance.py @@ -430,7 +430,7 @@ def resuming(self) -> bool: This method returns True for the first iteration of the reuse loop after resuming from a previously taken snapshot. When resuming from a snapshot, the submodel must load its state from the snapshot as returned - by :meth:`load_snapshot` and the F_INIT step must be skipped. + by :meth:`load_snapshot`. Returns: True iff the submodel must resume from a snapshot instead of the @@ -438,6 +438,20 @@ def resuming(self) -> bool: """ return self._snapshot_manager.resuming() + def should_init(self) -> bool: + """Check if this instance should initialize. + + Must be used by submodels that implement the checkpointing API. + + When resuming from a previous snapshot, instances need not always + execute the F_INIT phase of the submodel execution loop. Use this method + before attempting to receive data on F_INIT ports. + + Returns: + True iff the submodel must skip the F_INIT step + """ + return self._snapshot_manager.should_init() + def load_snapshot(self) -> Message: """Load a snapshot. @@ -452,42 +466,27 @@ def load_snapshot(self) -> Message: """ return self._snapshot_manager.load_snapshot() - def should_save_snapshot( - self, timestamp: float, next_timestamp: Optional[float]) -> bool: + def should_save_snapshot(self, timestamp: float) -> bool: """Check if a snapshot should be saved inside a time-integration loop. This method checks if a snapshot should be saved right now, based on the - provided timestamps and passed wallclock time. - - When the next timestamp is provided, this value will be used to - determine if a checkpoint will be passed between now and the next time - step. A submodel should always provide the next timestamp if available, - since this is the most reliable way to get consistent snapshots across - all submodels in the run. - - When a submodel cannot provide the next timestamp, a best efford is made - to get consistent snapshots (based on the current timestamp). See the - checkpointing tutorial for more information. + provided timestamp and passed wallclock time. When this method returns True, the submodel must also save a snapshot through :meth:`save_snapshot`. A RuntimeError will be generated when not doing so. See also :meth:`should_save_final_snapshot` for the variant that must be - called at the end of a time-integration loop, or when a submodel does - not have a time-integration loop. + called at the end of the reuse loop. Args: timestamp: current timestamp of the submodel - next_timestamp: timestamp of the next iteration of the time - integration loop of the submodel or ``None`` if not available Returns: True iff a snapshot should be taken by the submodel according to the checkpoint rules provided in the ymmsl configuration. """ - return self._snapshot_manager.should_save_snapshot( - timestamp, next_timestamp) + return self._snapshot_manager.should_save_snapshot(timestamp) def save_snapshot(self, message: Message) -> None: """Save a snapshot inside a time-integration loop. @@ -495,8 +494,8 @@ def save_snapshot(self, message: Message) -> None: Before saving a snapshot, you should check using :meth:`should_save_snapshot` if a snapshot should be saved according to the checkpoint rules specified in the ymmsl configuration. You should - use the same timestamp and next_timestamp in the provided Message object - as used to query `should_save_snapshot`. + use the same timestamp in the provided Message object as used to query + `should_save_snapshot`. Although it is allowed to save a snapshot even when :meth:`should_save_snapshot` returns False, you should avoid this: this @@ -505,18 +504,17 @@ def save_snapshot(self, message: Message) -> None: It could also lead to a lot of snapshot files clogging your file system. See also :meth:`save_final_snapshot` for the variant that must be called - at the end of a time-integration loop, or when a submodel does not have - a time-integration loop. + at the end of the reuse loop. Args: message: Message object that is saved as snapshot. The message - timestamp and next_timestamp attributes should be the same as - passed to :meth:`should_save_snapshot`. The data attribute can - be used to store the internal state of the submodel. + timestamp attribute should be the same as passed to + :meth:`should_save_snapshot`. The data attribute can be used to + store the internal state of the submodel. """ return self._snapshot_manager.save_snapshot(message) - def should_save_final_snapshot(self, timestamp: float) -> bool: + def should_save_final_snapshot(self) -> bool: """Check if a snapshot should be saved before O_F. This method checks if a snapshot should be saved right now, based on the @@ -529,14 +527,11 @@ def should_save_final_snapshot(self, timestamp: float) -> bool: See also :meth:`should_save_snapshot` for the variant that may be called inside of a time-integration loop of the submodel. - Args: - timestamp: current timestamp of the submodel - Returns: True iff a final snapshot should be taken by the submodel according to the checkpoint rules provided in the ymmsl configuration. """ - return self._snapshot_manager.should_save_final_snapshot(timestamp) + return self._snapshot_manager.should_save_final_snapshot() def save_final_snapshot(self, message: Message) -> None: """Save a snapshot before O_F. @@ -544,8 +539,6 @@ def save_final_snapshot(self, message: Message) -> None: Before saving a snapshot, you should check using :meth:`should_save_final_snapshot` if a snapshot should be saved according to the checkpoint rules specified in the ymmsl configuration. - You should use the same timestamp in the provided Message object as used - to query `should_save_final_snapshot`. Although it is allowed to save a snapshot even when :meth:`should_save_final_snapshot` returns False, you should avoid this: @@ -557,10 +550,9 @@ def save_final_snapshot(self, message: Message) -> None: of a time-integration loop of the submodel. Args: - message: Message object that is saved as snapshot. The message - timestamp should be the same as passed to - :meth:`should_save_snapshot`. The data attribute can be used to - store the internal state of the submodel. + message: Message object that is saved as snapshot. The data + attribute can be used to store the internal state of the + submodel. """ return self._snapshot_manager.save_final_snapshot(message) diff --git a/libmuscle/python/libmuscle/snapshot_manager.py b/libmuscle/python/libmuscle/snapshot_manager.py index 10f2c9fc..aa9f5dc6 100644 --- a/libmuscle/python/libmuscle/snapshot_manager.py +++ b/libmuscle/python/libmuscle/snapshot_manager.py @@ -70,7 +70,6 @@ def set_checkpoint_info(self, if self._trigger: self._trigger.update_checkpoints( snapshot.message.timestamp, - snapshot.message.next_timestamp, snapshot.is_final_snapshot) def reuse_instance(self, @@ -99,6 +98,17 @@ def resuming(self) -> bool: """ return self._resume_from_snapshot is not None + def should_init(self) -> bool: + """Check if F_INIT should be run in this reuse loop. + + Returns: + True: when not resuming this reuse loop, or when resuming from a + final snapshot. + False: otherwise + """ + return (self._resume_from_snapshot is None or + self._resume_from_snapshot.is_final_snapshot) + def load_snapshot(self) -> Message: """Get the Message to resume from """ @@ -107,20 +117,19 @@ def load_snapshot(self) -> Message: ' to check if a snapshot is available') return self._resume_from_snapshot.message - def should_save_snapshot(self, timestamp: float, - next_timestamp: Optional[float]) -> bool: + def should_save_snapshot(self, timestamp: float) -> bool: """See :meth:`TriggerManager.should_save_snapshot` """ if self._trigger is None: return False # checkpointing disabled - return self._trigger.should_save_snapshot(timestamp, next_timestamp) + return self._trigger.should_save_snapshot(timestamp) - def should_save_final_snapshot(self, timestamp: float) -> bool: + def should_save_final_snapshot(self) -> bool: """See :meth:`TriggerManager.should_save_final_snapshot` """ if self._trigger is None: return False # checkpointing disabled - return self._trigger.should_save_final_snapshot(timestamp) + return self._trigger.should_save_final_snapshot() def save_snapshot(self, msg: Message) -> None: """Save snapshot contained in the message object. @@ -158,7 +167,7 @@ def __save_snapshot(self, msg: Message, final: bool) -> None: if self._trigger is not None: self._trigger.update_checkpoints( - msg.timestamp, msg.next_timestamp, final) + msg.timestamp, final) def __load_snapshot(self, snapshot_location: Path) -> None: """Load a previously stored snapshot from the filesystem diff --git a/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py b/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py index baf0c2c1..873f79d8 100644 --- a/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py +++ b/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py @@ -147,6 +147,7 @@ def test_trigger_manager_reference_time(): assert 15.0 < elapsed_walltime <= (15.0 + elapsed_monotonic) +@pytest.mark.skip("To be updated") def test_trigger_manager(): reference = datetime.now(timezone.utc) trigger_manager = TriggerManager(reference, Checkpoints( @@ -196,6 +197,7 @@ def test_trigger_manager(): trigger_manager.reuse_instance(None) +@pytest.mark.skip("To be updated") def test_trigger_manager_warnings(caplog: pytest.LogCaptureFixture, monkeypatch: pytest.MonkeyPatch): monkeypatch.setenv("MUSCLE_DISABLE_CHECKPOINT_ERRORS", "1") diff --git a/libmuscle/python/libmuscle/test/test_snapshot_manager.py b/libmuscle/python/libmuscle/test/test_snapshot_manager.py index f1c18ec8..bfee09a1 100644 --- a/libmuscle/python/libmuscle/test/test_snapshot_manager.py +++ b/libmuscle/python/libmuscle/test/test_snapshot_manager.py @@ -11,6 +11,7 @@ from libmuscle.snapshot_manager import SnapshotManager +@pytest.mark.skip("To be updated") def test_no_checkpointing(caplog: pytest.LogCaptureFixture, tmp_path: Path ) -> None: manager = MagicMock() @@ -33,6 +34,7 @@ def test_no_checkpointing(caplog: pytest.LogCaptureFixture, tmp_path: Path assert "no checkpoints" in caplog.records[0].message +@pytest.mark.skip("To be updated") def test_save_load_checkpoint(tmp_path: Path) -> None: manager = MagicMock() communicator = MagicMock() From 60b8aa961006f79acb0fe3c7f16710b6a0cfecaa Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Mon, 14 Nov 2022 13:30:52 +0100 Subject: [PATCH 068/183] Refactor reuse_instance logic - Execute as part of should_save_final_snapshot - Check in Instance.reuse_instance if already called & call otherwise --- .../python/libmuscle/checkpoint_triggers.py | 9 +- libmuscle/python/libmuscle/instance.py | 117 +++++++++++------- .../python/libmuscle/snapshot_manager.py | 16 ++- 3 files changed, 83 insertions(+), 59 deletions(-) diff --git a/libmuscle/python/libmuscle/checkpoint_triggers.py b/libmuscle/python/libmuscle/checkpoint_triggers.py index b134f76d..51bd7848 100644 --- a/libmuscle/python/libmuscle/checkpoint_triggers.py +++ b/libmuscle/python/libmuscle/checkpoint_triggers.py @@ -252,16 +252,9 @@ def should_save_final_snapshot(self) -> bool: self._should_save_final_called = True return value - def reuse_instance(self, max_f_init_next_timestamp: Optional[float] - ) -> None: + def reuse_instance(self) -> None: """Cleanup between instance reuse - - Args: - max_f_init_next_timestamp: the maximum next_timestamp of all - messages pre--received during F_INIT. """ - self._max_f_init_next_timestamp = max_f_init_next_timestamp - if self._first_reuse: self._first_reuse = False else: diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py index 755503bc..7e2559d9 100644 --- a/libmuscle/python/libmuscle/instance.py +++ b/libmuscle/python/libmuscle/instance.py @@ -74,6 +74,8 @@ def __init__(self, ports: Optional[Dict[Operator, List[str]]] = None self._first_run = True """Keeps track of whether this is the first reuse run.""" + self._do_reuse = None # type: Optional[bool] + """Caching variable for result from :meth:`__check_reuse_instance`""" self._f_init_cache = dict() # type: _FInitCacheType @@ -129,37 +131,12 @@ def reuse_instance(self, apply_overlay: bool = True) -> bool: :meth:`should_save_final_snapshot` and :meth:`save_final_snapshot`, or the checkpointing tutorial. """ - do_reuse = self.__receive_settings() - - # TODO: _f_init_cache should be empty here, or the user didn't - # receive something that was sent on the last go-around. - # At least emit a warning. - if not (self.resuming() and self._first_run): - # when resuming we skip receiving on f_init in the first run - self.__pre_receive_f_init(apply_overlay) - - self._set_local_log_level() - self._set_remote_log_level() - - ports = self._communicator.list_ports() - f_init_not_connected = all( - [not self.is_connected(port) - for port in ports.get(Operator.F_INIT, [])]) - no_settings_in = not self._communicator.settings_in_connected() - - if f_init_not_connected and no_settings_in: - do_reuse = self._first_run - else: - for message in self._f_init_cache.values(): - if isinstance(message.data, ClosePort): - do_reuse = False - self._first_run = False + do_reuse = self._do_reuse + if do_reuse is None: + # should_save_final_snapshot not called, so we need to check_reuse + do_reuse = self.__check_reuse_instance(apply_overlay) + self._do_reuse = None - max_f_init_next_timestamp = max( - (msg.next_timestamp - for msg in self._f_init_cache.values() - if msg.next_timestamp is not None), - default=None) # Note: muscle_snapshot_directory setting is provided by muscle_manager # when checkpointing is enabled for this run. When checkpointing is not # enabled, it might not exist and a KeyError is raised. @@ -168,14 +145,8 @@ def reuse_instance(self, apply_overlay: bool = True) -> bool: snapshot_path = Path(snapshot_dir) except KeyError: snapshot_path = None - self._snapshot_manager.reuse_instance( - max_f_init_next_timestamp, snapshot_path) + self._snapshot_manager.reuse_instance(snapshot_path) - if not do_reuse: - self.__close_ports() - self._communicator.shutdown() - self._deregister() - self.__manager.close() return do_reuse def error_shutdown(self, message: str) -> None: @@ -514,7 +485,7 @@ def save_snapshot(self, message: Message) -> None: """ return self._snapshot_manager.save_snapshot(message) - def should_save_final_snapshot(self) -> bool: + def should_save_final_snapshot(self, *, apply_overlay: bool = True) -> bool: """Check if a snapshot should be saved before O_F. This method checks if a snapshot should be saved right now, based on the @@ -527,11 +498,32 @@ def should_save_final_snapshot(self) -> bool: See also :meth:`should_save_snapshot` for the variant that may be called inside of a time-integration loop of the submodel. + .. note:: + This method will block until it can determine whether a final + snapshot should be taken. This means it must also determine if this + instance is reused. The optional keword-only argument + `apply_overlay` has the same meaning as for :meth:`reuse_instance`. + + Args: + apply_overlay: Whether to apply the received settings + overlay or to save it. If you're going to use + :meth:`receive_with_settings` on your F_INIT ports, set this to + False. If you don't know what that means, just call + `reuse_instance()` without specifying this and everything will + be fine. If it turns out that you did need to specify False, + MUSCLE3 will tell you about it in an error message and you can + add it still. + Returns: True iff a final snapshot should be taken by the submodel according to the checkpoint rules provided in the ymmsl configuration. """ - return self._snapshot_manager.should_save_final_snapshot() + self._do_reuse = self.__check_reuse_instance(apply_overlay) + f_init_max_timestamp = max( + (msg.timestamp for msg in self._f_init_cache.values()), + default=None) + return self._snapshot_manager.should_save_final_snapshot( + self._do_reuse, f_init_max_timestamp) def save_final_snapshot(self, message: Message) -> None: """Save a snapshot before O_F. @@ -632,6 +624,46 @@ def __set_up_logging(self) -> None: self.__manager) logging.getLogger().addHandler(self._mmp_handler) + def __check_reuse_instance(self, apply_overlay: bool) -> bool: + """Pre-receive F_INIT messages and detect if this instance is reused. + + This is called during :meth:`should_save_final_snapshot` to detect if a + snapshot must be taken. If an instance does implement checkpointing, + :meth:`reuse_instance` will call it instead. + """ + do_reuse = self.__receive_settings() + + # TODO: _f_init_cache should be empty here, or the user didn't + # receive something that was sent on the last go-around. + # At least emit a warning. + if not (self.resuming() and self._first_run): + # when resuming we skip receiving on f_init in the first run + self.__pre_receive_f_init(apply_overlay) + + self._set_local_log_level() + self._set_remote_log_level() + + ports = self._communicator.list_ports() + f_init_not_connected = all( + [not self.is_connected(port) + for port in ports.get(Operator.F_INIT, [])]) + no_settings_in = not self._communicator.settings_in_connected() + + if f_init_not_connected and no_settings_in: + do_reuse = self._first_run + else: + for message in self._f_init_cache.values(): + if isinstance(message.data, ClosePort): + do_reuse = False + self._first_run = False + + if not do_reuse: + self.__close_ports() + self._communicator.shutdown() + self._deregister() + self.__manager.close() + return do_reuse + def __receive_message( self, port_name: str, slot: Optional[int], default: Optional[Message], with_settings: bool @@ -651,9 +683,10 @@ def __receive_message( if with_settings and msg.settings is None: err_msg = ('If you use receive_with_settings()' ' on an F_INIT port, then you have to' - ' pass False to reuse_instance(),' - ' otherwise the settings will already' - ' have been applied by MUSCLE.') + ' pass apply_overlay=False to reuse_instance() ' + ' and should_save_final_snapshot(),' + ' if applicable, otherwise the settings will' + ' already have been applied by MUSCLE.') self.__shutdown(err_msg) raise RuntimeError(err_msg) else: diff --git a/libmuscle/python/libmuscle/snapshot_manager.py b/libmuscle/python/libmuscle/snapshot_manager.py index aa9f5dc6..e557070f 100644 --- a/libmuscle/python/libmuscle/snapshot_manager.py +++ b/libmuscle/python/libmuscle/snapshot_manager.py @@ -72,19 +72,14 @@ def set_checkpoint_info(self, snapshot.message.timestamp, snapshot.is_final_snapshot) - def reuse_instance(self, - max_f_init_next_timestamp: Optional[float], - snapshot_directory: Optional[Path], - ) -> None: + def reuse_instance(self, snapshot_directory: Optional[Path]) -> None: """Callback on Instance.reuse_instance Args: - max_f_init_next_timestamp: maximum next_timestamp of all F_INIT - messages. May be None if no message has next_timestamp set or - if no F_INIT messages were received. + snapshot_directory: Path to store this instance's snapshots in. """ if self._trigger is not None: - self._trigger.reuse_instance(max_f_init_next_timestamp) + self._trigger.reuse_instance() self._snapshot_directory = snapshot_directory @@ -124,7 +119,10 @@ def should_save_snapshot(self, timestamp: float) -> bool: return False # checkpointing disabled return self._trigger.should_save_snapshot(timestamp) - def should_save_final_snapshot(self) -> bool: + def should_save_final_snapshot( + self, do_reuse: bool, + f_init_max_timestamp: Optional[float] + ) -> bool: """See :meth:`TriggerManager.should_save_final_snapshot` """ if self._trigger is None: From 677693b4a49714719547effb754c50079094bcdc Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Tue, 15 Nov 2022 10:58:49 +0100 Subject: [PATCH 069/183] Refactor TriggerManager It is now always available on SnapshotManager and checks internally if checkpoints are defined. --- .../python/libmuscle/checkpoint_triggers.py | 33 +++++++++++++++++-- .../python/libmuscle/snapshot_manager.py | 33 +++++-------------- .../test/test_checkpoint_triggers.py | 9 +++-- 3 files changed, 45 insertions(+), 30 deletions(-) diff --git a/libmuscle/python/libmuscle/checkpoint_triggers.py b/libmuscle/python/libmuscle/checkpoint_triggers.py index 51bd7848..ff12918b 100644 --- a/libmuscle/python/libmuscle/checkpoint_triggers.py +++ b/libmuscle/python/libmuscle/checkpoint_triggers.py @@ -187,8 +187,20 @@ class TriggerManager: """Manages all checkpoint triggers and checks if a snapshot must be saved. """ - def __init__(self, utc_reference: datetime, checkpoints: Checkpoints - ) -> None: + def __init__(self) -> None: + self._has_checkpoints = False + self._last_triggers = [] # type: List[str] + self._monotonic_reference = time.monotonic() + + def set_checkpoint_info( + self, utc_reference: datetime, checkpoints: Checkpoints) -> None: + """Register checkpoint info received from the muscle manager. + """ + if not checkpoints: + self._has_checkpoints = False + return + + self._has_checkpoints = True self._monotonic_reference = _utc_to_monotonic(utc_reference) self._wall = CombinedCheckpointTriggers(checkpoints.wallclock_time) @@ -200,7 +212,6 @@ def __init__(self, utc_reference: datetime, checkpoints: Checkpoints self._nextsim = None # type: Optional[float] self._sim_reset = True - self._last_triggers = [] # type: List[str] self._first_reuse = True self._max_f_init_next_timestamp = None # type: Optional[float] @@ -218,6 +229,9 @@ def elapsed_walltime(self) -> float: def should_save_snapshot(self, timestamp: float) -> bool: """Handles instance.should_save_snapshot """ + if not self._has_checkpoints: + return False + if self._should_have_saved: _checkpoint_error('"should_save_snapshot" or ' '"should_save_final_snapshot" returned positive' @@ -231,6 +245,9 @@ def should_save_snapshot(self, timestamp: float) -> bool: def should_save_final_snapshot(self) -> bool: """Handles instance.should_save_final_snapshot """ + if not self._has_checkpoints: + return False + if self._should_have_saved: _checkpoint_error('"should_save_snapshot" or ' '"should_save_final_snapshot" returned positive' @@ -255,6 +272,8 @@ def should_save_final_snapshot(self) -> bool: def reuse_instance(self) -> None: """Cleanup between instance reuse """ + if not self._has_checkpoints: + return if self._first_reuse: self._first_reuse = False else: @@ -277,6 +296,14 @@ def update_checkpoints(self, timestamp: float, final: bool) -> None: timestamp: timestamp as reported by the instance next_timestamp: next timestamp as reported by the instance """ + if not self._has_checkpoints: + _logger.info('Saving a snapshot, but no snapshots requested by the' + ' workflow. Hint: use Instance.should_save_snapshot(),' + ' Instance.should_save_final_snapshot() or' + ' Instance.snapshots_enabled() to test if it is useful' + ' to save a snapshot.') + return + self._prevwall = self.elapsed_walltime() self._nextwall = self._wall.next_checkpoint(self._prevwall) diff --git a/libmuscle/python/libmuscle/snapshot_manager.py b/libmuscle/python/libmuscle/snapshot_manager.py index e557070f..4061f092 100644 --- a/libmuscle/python/libmuscle/snapshot_manager.py +++ b/libmuscle/python/libmuscle/snapshot_manager.py @@ -41,8 +41,8 @@ def __init__(self, self._manager = manager self._first_reuse = True + self._trigger = TriggerManager() self._resume_from_snapshot = None # type: Optional[Snapshot] - self._trigger = None # type: Optional[TriggerManager] self._snapshot_directory = None # type: Optional[Path] self._next_snapshot_num = 1 @@ -60,17 +60,15 @@ def set_checkpoint_info(self, checkpoints: requested workflow checkpoints resume: previous snapshot to resume from (or None if not resuming) """ - if checkpoints: - self._trigger = TriggerManager(utc_reference, checkpoints) + self._trigger.set_checkpoint_info(utc_reference, checkpoints) if resume is not None: self.__load_snapshot(resume) snapshot = cast(Snapshot, self._resume_from_snapshot) self._communicator.restore_message_counts( snapshot.port_message_counts) - if self._trigger: - self._trigger.update_checkpoints( - snapshot.message.timestamp, - snapshot.is_final_snapshot) + self._trigger.update_checkpoints( + snapshot.message.timestamp, + snapshot.is_final_snapshot) def reuse_instance(self, snapshot_directory: Optional[Path]) -> None: """Callback on Instance.reuse_instance @@ -78,8 +76,7 @@ def reuse_instance(self, snapshot_directory: Optional[Path]) -> None: Args: snapshot_directory: Path to store this instance's snapshots in. """ - if self._trigger is not None: - self._trigger.reuse_instance() + self._trigger.reuse_instance() self._snapshot_directory = snapshot_directory @@ -115,8 +112,6 @@ def load_snapshot(self) -> Message: def should_save_snapshot(self, timestamp: float) -> bool: """See :meth:`TriggerManager.should_save_snapshot` """ - if self._trigger is None: - return False # checkpointing disabled return self._trigger.should_save_snapshot(timestamp) def should_save_final_snapshot( @@ -125,8 +120,6 @@ def should_save_final_snapshot( ) -> bool: """See :meth:`TriggerManager.should_save_final_snapshot` """ - if self._trigger is None: - return False # checkpointing disabled return self._trigger.should_save_final_snapshot() def save_snapshot(self, msg: Message) -> None: @@ -146,14 +139,8 @@ def __save_snapshot(self, msg: Message, final: bool) -> None: msg: message object representing the snapshot final: True iff called from save_final_snapshot """ - if self._trigger is None: - _logger.info('Saving a snapshot but no checkpoints requested' - ' by the workflow.') - triggers = [] - wallclock_time = 0.0 - else: - triggers = self._trigger.get_triggers() - wallclock_time = self._trigger.elapsed_walltime() + triggers = self._trigger.get_triggers() + wallclock_time = self._trigger.elapsed_walltime() port_message_counts = self._communicator.get_message_counts() snapshot = MsgPackSnapshot( @@ -163,9 +150,7 @@ def __save_snapshot(self, msg: Message, final: bool) -> None: metadata = SnapshotMetadata.from_snapshot(snapshot, str(path)) self._manager.submit_snapshot_metadata(self._instance_id, metadata) - if self._trigger is not None: - self._trigger.update_checkpoints( - msg.timestamp, final) + self._trigger.update_checkpoints(msg.timestamp, final) def __load_snapshot(self, snapshot_location: Path) -> None: """Load a previously stored snapshot from the filesystem diff --git a/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py b/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py index 873f79d8..4f6eed0d 100644 --- a/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py +++ b/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py @@ -141,7 +141,8 @@ def test_trigger_manager_reference_time(): monotonic_now = time.monotonic() utcnow = datetime.now(timezone.utc) reference = utcnow - timedelta(seconds=15) - trigger_manager = TriggerManager(reference, Checkpoints()) + trigger_manager = TriggerManager() + trigger_manager.set_checkpoint_info(reference, Checkpoints(at_end=True)) elapsed_walltime = trigger_manager.elapsed_walltime() elapsed_monotonic = time.monotonic() - monotonic_now assert 15.0 < elapsed_walltime <= (15.0 + elapsed_monotonic) @@ -150,7 +151,8 @@ def test_trigger_manager_reference_time(): @pytest.mark.skip("To be updated") def test_trigger_manager(): reference = datetime.now(timezone.utc) - trigger_manager = TriggerManager(reference, Checkpoints( + trigger_manager = TriggerManager() + trigger_manager.set_checkpoint_info(reference, Checkpoints( wallclock_time=[CheckpointAtRule([1e-12])], simulation_time=[CheckpointAtRule([1, 3, 5])])) @@ -203,7 +205,8 @@ def test_trigger_manager_warnings(caplog: pytest.LogCaptureFixture, monkeypatch.setenv("MUSCLE_DISABLE_CHECKPOINT_ERRORS", "1") reference = datetime.now(timezone.utc) - trigger_manager = TriggerManager(reference, Checkpoints( + trigger_manager = TriggerManager() + trigger_manager.set_checkpoint_info(reference, Checkpoints( simulation_time=[CheckpointAtRule([1, 3, 5])])) trigger_manager.reuse_instance(2) From a8d81bb2969774fc81b270cb917d3f22d5ccbf99 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Tue, 15 Nov 2022 13:07:48 +0100 Subject: [PATCH 070/183] Process documentation comments --- docs/source/coupling.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/coupling.rst b/docs/source/coupling.rst index 8c764a97..5fccaf73 100644 --- a/docs/source/coupling.rst +++ b/docs/source/coupling.rst @@ -4,8 +4,8 @@ Coupling your model Multicast --------- -With MUSCLE3 you can connect an output port to multiple input ports. This is -called multicast. When a submodel sends a message on a port that is connected to +With MUSCLE3 you can connect an output port to multiple input ports. +When a submodel sends a message on a port that is connected to multiple input ports, the message is copied and sent to each connected port. .. note:: From 20826eeb279ddc9d02f356a4b4ee935038beebb8 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Tue, 15 Nov 2022 13:13:57 +0100 Subject: [PATCH 071/183] Move profiling of multicast messages out for-loop --- libmuscle/python/libmuscle/communicator.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/libmuscle/python/libmuscle/communicator.py b/libmuscle/python/libmuscle/communicator.py index cefd5e3b..7e9a7131 100644 --- a/libmuscle/python/libmuscle/communicator.py +++ b/libmuscle/python/libmuscle/communicator.py @@ -206,6 +206,8 @@ def send_message( return port = self._ports[port_name] + profile_event = self._profiler.start(ProfileEventType.SEND, port, + None, slot, None) recv_endpoints = self._peer_manager.get_peer_endpoints( snd_endpoint.port, slot_list) @@ -215,9 +217,6 @@ def send_message( port_length = self._ports[port_name].get_length() for recv_endpoint in recv_endpoints: - profile_event = self._profiler.start(ProfileEventType.SEND, port, - None, slot, None) - mcp_message = MPPMessage(snd_endpoint.ref(), recv_endpoint.ref(), port_length, message.timestamp, message.next_timestamp, @@ -226,10 +225,10 @@ def send_message( encoded_message = mcp_message.encoded() self._post_office.deposit(recv_endpoint.ref(), encoded_message) - profile_event.stop() - if port.is_vector(): - profile_event.port_length = port.get_length() - profile_event.message_size = len(encoded_message) + profile_event.stop() + if port.is_vector(): + profile_event.port_length = port.get_length() + profile_event.message_size = len(encoded_message) def receive_message(self, port_name: str, slot: Optional[int] = None, default: Optional[Message] = None From f0f974068cce1e44457519d38acd214cc80c6ee8 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Tue, 15 Nov 2022 13:16:06 +0100 Subject: [PATCH 072/183] Update docstring --- libmuscle/python/libmuscle/peer_manager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libmuscle/python/libmuscle/peer_manager.py b/libmuscle/python/libmuscle/peer_manager.py index 0a7600c0..a0c28c9a 100644 --- a/libmuscle/python/libmuscle/peer_manager.py +++ b/libmuscle/python/libmuscle/peer_manager.py @@ -91,14 +91,14 @@ def get_peer_locations(self, peer_instance: Reference) -> List[str]: def get_peer_endpoints(self, port: Identifier, slot: List[int] ) -> List[Endpoint]: - """Determine the peer endpoint for the given port and slot. + """Determine the peer endpoints for the given port and slot. Args: port: The port on our side to send or receive on. slot: The slot to send or receive on. Returns: - The peer endpoint. + The peer endpoints. """ peers = self.__peers[self.__kernel + port] endpoints = [] From e00e1eabd3f366b9aa7f2a185a62cd68cf5ac1eb Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Tue, 15 Nov 2022 13:59:27 +0100 Subject: [PATCH 073/183] Add clang to the CI --- .github/workflows/ci_ubuntu_22.04_clang.yaml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 .github/workflows/ci_ubuntu_22.04_clang.yaml diff --git a/.github/workflows/ci_ubuntu_22.04_clang.yaml b/.github/workflows/ci_ubuntu_22.04_clang.yaml new file mode 100644 index 00000000..125b3fe6 --- /dev/null +++ b/.github/workflows/ci_ubuntu_22.04_clang.yaml @@ -0,0 +1,20 @@ +# Run Continuous Integration for the latest Ubuntu release +# This mainly checks for issues/regressions in the native build +name: native_compatibility_ubuntu22.04_clang +on: + schedule: + - cron: '0 3 * * 0' + push: + branches: + - 'release-*' + - fix_native_compatibility_ci + - feature/clang_build +jobs: + build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + + - name: Run tests on Ubuntu 22.04 with Clang + run: docker run -v "${GITHUB_WORKSPACE}:/workspace" --env LC_ALL=C.UTF-8 --env LANG=C.UTF-8 --env DEBIAN_FRONTEND=noninteractive ubuntu:22.04 /bin/bash -c 'apt-get update && apt-get -y dist-upgrade && apt-get -y install build-essential clang cmake gfortran git valgrind libopenmpi-dev pkg-config python3 python3-pip python3-venv curl && apt-get -y remove libssl-dev && useradd -m -d /home/muscle3 muscle3 && su muscle3 -c -- "cp -r --preserve=mode /workspace /home/muscle3/muscle3" && su muscle3 -c -- "pip3 install -U pip setuptools wheel" && su muscle3 -c -- "pip3 install \"ymmsl>=0.12.0,<0.13\" qcg-pilotjob==0.13.1" && su muscle3 -c -- "cd /home/muscle3/muscle3 && CXXFLAGS=-fPIE OMPI_CXX=clang++ CXX=clang++ make test_examples"' From a0dabc31fe55361b671445efe2834e3e0a2e45f0 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Tue, 15 Nov 2022 15:10:57 +0100 Subject: [PATCH 074/183] MMP - get checkpoint info as separate request type --- libmuscle/python/libmuscle/instance.py | 14 +++---- .../python/libmuscle/manager/mmp_server.py | 42 ++++++++----------- .../manager/test/test_mmp_request_handler.py | 19 ++++----- libmuscle/python/libmuscle/mcp/protocol.py | 1 + libmuscle/python/libmuscle/mmp_client.py | 27 +++++++----- .../python/libmuscle/snapshot_manager.py | 19 +++++---- .../python/libmuscle/test/test_instance.py | 6 +-- .../libmuscle/test/test_snapshot_manager.py | 6 +-- 8 files changed, 67 insertions(+), 67 deletions(-) diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py index 7e2559d9..c064e6af 100644 --- a/libmuscle/python/libmuscle/instance.py +++ b/libmuscle/python/libmuscle/instance.py @@ -1,5 +1,4 @@ from copy import copy -from datetime import datetime import logging import os from pathlib import Path @@ -9,7 +8,7 @@ from typing_extensions import Literal from ymmsl import (Identifier, Operator, SettingValue, Port, Reference, - Settings, Checkpoints) + Settings) from libmuscle.communicator import Communicator, Message from libmuscle.settings_manager import SettingsManager @@ -79,11 +78,11 @@ def __init__(self, ports: Optional[Dict[Operator, List[str]]] = None self._f_init_cache = dict() # type: _FInitCacheType - checkpoint_info = self._register() + self._register() self._connect() - # Note: SnapshotManager.set_checkpoint_info needs to have the ports + # Note: SnapshotManager.get_checkpoint_info needs to have the ports # initialized so it comes after self._connect() - self._snapshot_manager.set_checkpoint_info(*checkpoint_info) + self._snapshot_manager.get_checkpoint_info() self._set_local_log_level() self._set_remote_log_level() @@ -548,17 +547,16 @@ def save_final_snapshot(self, message: Message) -> None: """ return self._snapshot_manager.save_final_snapshot(message) - def _register(self) -> Tuple[datetime, Checkpoints, Optional[Path]]: + def _register(self) -> None: """Register this instance with the manager. """ register_event = self._profiler.start(ProfileEventType.REGISTER) locations = self._communicator.get_locations() port_list = self.__list_declared_ports() - checkpoint_info = self.__manager.register_instance( + self.__manager.register_instance( self._instance_name(), locations, port_list) register_event.stop() _logger.info('Registered with the manager') - return checkpoint_info def _connect(self) -> None: """Connect this instance to the given peers / conduits. diff --git a/libmuscle/python/libmuscle/manager/mmp_server.py b/libmuscle/python/libmuscle/manager/mmp_server.py index f5b8b692..9382d0eb 100644 --- a/libmuscle/python/libmuscle/manager/mmp_server.py +++ b/libmuscle/python/libmuscle/manager/mmp_server.py @@ -1,7 +1,7 @@ from datetime import datetime, timezone import errno import logging -from typing import Any, Dict, Optional, Tuple, cast, List +from typing import Any, Dict, cast, List import msgpack from ymmsl import ( @@ -23,8 +23,6 @@ _logger = logging.getLogger(__name__) -_EncodedCheckpointType = Dict[str, List[Dict[str, Any]]] - def decode_operator(data: str) -> Operator: """Create an Operator from a MsgPack-compatible value.""" @@ -41,9 +39,10 @@ def encode_conduit(conduit: Conduit) -> List[str]: return [str(conduit.sender), str(conduit.receiver)] -def encode_checkpoints(checkpoints: Checkpoints) -> _EncodedCheckpointType: +def encode_checkpoints(checkpoints: Checkpoints) -> Dict[str, Any]: """Convert a Checkpoins to a MsgPack-compatible value.""" return { + "at_end": checkpoints.at_end, "wallclock_time": [vars(rule) for rule in checkpoints.wallclock_time], "simulation_time": [vars(rule) for rule in checkpoints.simulation_time] } @@ -100,6 +99,8 @@ def handle_request(self, request: bytes) -> bytes: response = self._submit_profile_events(*req_args) elif req_type == RequestType.SUBMIT_SNAPSHOT.value: response = self._submit_snapshot(*req_args) + elif req_type == RequestType.GET_CHECKPOINT_INFO.value: + response = self._get_checkpoint_info(*req_args) return cast(bytes, msgpack.packb(response, use_bin_type=True)) @@ -118,13 +119,6 @@ def _register_instance( status (ResponseType): SUCCESS or ERROR error_msg (str): An error message, only present if status equals ERROR - checkpoint_info (Tuple[str, bytes, Optional[str]]): Checkpoint info, - only present if status equals SUCCESS. The first item is a tuple - encoding of the wallclock reference time (year, month, day, - hour, minute, second, microsecond) in UTC. The second item is a - dict encoding a ymmsl.Checkpoints object. The final item is the - checkpoint filename that the registered instance should resume - from, or None if no resume is requested. """ port_objs = [decode_port(p) for p in ports] instance = Reference(instance_id) @@ -132,8 +126,7 @@ def _register_instance( self._instance_registry.add(instance, locations, port_objs) _logger.info(f'Registered instance {instance_id}') - checkpoint_info = self._get_checkpoint_info(instance) - return [ResponseType.SUCCESS.value, checkpoint_info] + return [ResponseType.SUCCESS.value] except AlreadyRegistered: return [ ResponseType.ERROR.value, @@ -279,28 +272,29 @@ def _submit_snapshot( self._snapshot_registry.register_snapshot(instance, snapshot_obj) return [ResponseType.SUCCESS.value] - def _get_checkpoint_info( - self, - instance: Reference - ) -> Tuple[float, _EncodedCheckpointType, Optional[str]]: + def _get_checkpoint_info(self, instance_id: str) -> Any: """Get checkpoint info for an instance Args: instance: The instance whose checkpoint info to get Returns: - wallclock_reference_time: tuple encoding UTC reference for wallclock - time = 0: (year, month, day, hour, minute, second, microsecond) - checkpoints: yaml-encoded ymmsl.Checkpoints object - resume: path of the snapshot file to resume from (or None if not - resuming) + A list containing the following values on success: + + status (ResponseType): SUCCESS + wallclock_reference_time (float): Unix timestamp (in UTC) indicating + wallclock time of the start of the workflow. + checkpoints (dict): Dictionary encdoing a ymmsl.Checkpoints object. + resume_path (Optional[str]): Checkpoint filename to resume from. """ + instance = Reference(instance_id) resume = None if instance in self._configuration.resume: resume = str(self._configuration.resume[instance]) - return (self._reference_timestamp, + return [ResponseType.SUCCESS.value, + self._reference_timestamp, encode_checkpoints(self._configuration.checkpoints), - resume) + resume] class MMPServer: diff --git a/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py b/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py index ac80dca2..89de4068 100644 --- a/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py +++ b/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py @@ -95,31 +95,28 @@ def test_register_instance(mmp_request_handler, instance_registry): assert registered_ports['test_instance'][0].operator == Operator.F_INIT -def test_register_instance_checkpoint_info( - mmp_configuration, mmp_request_handler): +def test_get_checkpoint_info(mmp_configuration, mmp_request_handler): resume_path = Path('/path/to/resume.pack') mmp_configuration.resume = {Reference('test_instance'): resume_path} - mmp_configuration.checkpoints = Checkpoints([CheckpointRangeRule(every=10), - CheckpointAtRule([1, 2, 3.0])]) + mmp_configuration.checkpoints = Checkpoints( + True, + [CheckpointRangeRule(every=10), CheckpointAtRule([1, 2, 3.0])]) - request = [ - RequestType.REGISTER_INSTANCE.value, - 'test_instance', - ['tcp://localhost:10000'], - [['test_in', 'F_INIT']]] + request = [RequestType.GET_CHECKPOINT_INFO.value, 'test_instance'] encoded_request = msgpack.packb(request, use_bin_type=True) result = mmp_request_handler.handle_request(encoded_request) decoded_result = msgpack.unpackb(result, raw=False) assert decoded_result[0] == ResponseType.SUCCESS.value - timestamp, checkpoints, resume = decoded_result[1] + timestamp, checkpoints, resume = decoded_result[1:] ref_time = datetime.fromtimestamp(timestamp, tz=timezone.utc) assert ref_time == mmp_request_handler._reference_time assert isinstance(checkpoints, dict) - assert checkpoints.keys() == {'wallclock_time', 'simulation_time'} + assert checkpoints.keys() == {'at_end', 'wallclock_time', 'simulation_time'} + assert checkpoints['at_end'] is True assert checkpoints['simulation_time'] == [] wallclock_time = checkpoints['wallclock_time'] assert len(wallclock_time) == 2 diff --git a/libmuscle/python/libmuscle/mcp/protocol.py b/libmuscle/python/libmuscle/mcp/protocol.py index 06d1c0da..5d1217ed 100644 --- a/libmuscle/python/libmuscle/mcp/protocol.py +++ b/libmuscle/python/libmuscle/mcp/protocol.py @@ -21,6 +21,7 @@ class RequestType(Enum): SUBMIT_LOG_MESSAGE = 5 SUBMIT_PROFILE_EVENTS = 6 SUBMIT_SNAPSHOT = 7 + GET_CHECKPOINT_INFO = 8 # MUSCLE Peer Protocol GET_NEXT_MESSAGE = 21 diff --git a/libmuscle/python/libmuscle/mmp_client.py b/libmuscle/python/libmuscle/mmp_client.py index 6a3fe729..37effdca 100644 --- a/libmuscle/python/libmuscle/mmp_client.py +++ b/libmuscle/python/libmuscle/mmp_client.py @@ -57,12 +57,12 @@ def decode_checkpoint_rule(rule: Dict[str, Any]) -> CheckpointRule: return CheckpointAtRule(**rule) if rule.keys() == {'start', 'stop', 'every'}: return CheckpointRangeRule(**rule) - raise ValueError('Cannot convert {rule} to a checkpoint rule.') + raise ValueError(f'Cannot convert {rule} to a checkpoint rule.') def decode_checkpoint_info( reference_timestamp: float, - checkpoints_dict: Dict[str, List[Dict[str, Any]]], + checkpoints_dict: Dict[str, Any], resume: Optional[str] ) -> Tuple[datetime, Checkpoints, Optional[Path]]: """Decode checkpoint info from a MsgPack-compatible value. @@ -80,6 +80,7 @@ def decode_checkpoint_info( """ ref_time = datetime.fromtimestamp(reference_timestamp, tz=timezone.utc) checkpoints = Checkpoints( + at_end=checkpoints_dict["at_end"], wallclock_time=[decode_checkpoint_rule(rule) for rule in checkpoints_dict["wallclock_time"]], simulation_time=[decode_checkpoint_rule(rule) @@ -161,9 +162,21 @@ def get_settings(self) -> Settings: response = self._call_manager(request) return Settings(response[1]) + def get_checkpoint_info(self, name: Reference + ) -> Tuple[datetime, Checkpoints, Optional[Path]]: + """Get the checkpoint info from the manager. + + Returns: + wallclock_time_reference: UTC time where wallclock_time = 0 + checkpoints: checkpoint configuration + resume: path to the resume snapshot + """ + request = [RequestType.GET_CHECKPOINT_INFO.value, str(name)] + response = self._call_manager(request) + return decode_checkpoint_info(*response[1:]) + def register_instance(self, name: Reference, locations: List[str], - ports: List[Port] - ) -> Tuple[datetime, Checkpoints, Optional[Path]]: + ports: List[Port]) -> None: """Register a component instance with the manager. Args: @@ -171,11 +184,6 @@ def register_instance(self, name: Reference, locations: List[str], locations: List of places where the instance can be reached. ports: List of ports of this instance. - - Returns: - wallclock_time_reference: UTC time where wallclock_time = 0 - checkpoints: checkpoint configuration - resume: path to the resume snapshot """ request = [ RequestType.REGISTER_INSTANCE.value, @@ -185,7 +193,6 @@ def register_instance(self, name: Reference, locations: List[str], if response[0] == ResponseType.ERROR.value: raise RuntimeError( f'Error registering instance: {response[1]}') - return decode_checkpoint_info(*response[1]) def request_peers( self, name: Reference) -> Tuple[ diff --git a/libmuscle/python/libmuscle/snapshot_manager.py b/libmuscle/python/libmuscle/snapshot_manager.py index 4061f092..ffd7e4b2 100644 --- a/libmuscle/python/libmuscle/snapshot_manager.py +++ b/libmuscle/python/libmuscle/snapshot_manager.py @@ -46,14 +46,17 @@ def __init__(self, self._snapshot_directory = None # type: Optional[Path] self._next_snapshot_num = 1 - def set_checkpoint_info(self, - utc_reference: datetime, - checkpoints: Checkpoints, - resume: Optional[Path]) -> None: - """Callback after registering with the manager. - - Provide the snapshot manager with info on workflow checkpoints and if we - should resume from a previous snapshot. + def get_checkpoint_info(self) -> None: + """Request checkpoint info from the muscle manager. + """ + checkpoint_info = self._manager.get_checkpoint_info(self._instance_id) + self._set_checkpoint_info(*checkpoint_info) + + def _set_checkpoint_info(self, + utc_reference: datetime, + checkpoints: Checkpoints, + resume: Optional[Path]) -> None: + """Apply checkpoint info received from the manager. Args: utc_reference: datetime (in UTC) indicating wallclock_time=0 diff --git a/libmuscle/python/libmuscle/test/test_instance.py b/libmuscle/python/libmuscle/test/test_instance.py index e8c7f9b0..54044a00 100644 --- a/libmuscle/python/libmuscle/test/test_instance.py +++ b/libmuscle/python/libmuscle/test/test_instance.py @@ -50,7 +50,7 @@ def instance(sys_argv_instance): mmp_client_object = MagicMock() mmp_client_object.request_peers.return_value = (None, None, None) checkpoint_info = (datetime.now(timezone.utc), Checkpoints(), None) - mmp_client_object.register_instance.return_value = checkpoint_info + mmp_client_object.get_checkpoint_info.return_value = checkpoint_info mmp_client.return_value = mmp_client_object instance = Instance({ @@ -68,7 +68,7 @@ def instance2(sys_argv_instance): mmp_client_object = MagicMock() mmp_client_object.request_peers.return_value = (None, None, None) checkpoint_info = (datetime.now(timezone.utc), Checkpoints(), None) - mmp_client_object.register_instance.return_value = checkpoint_info + mmp_client_object.get_checkpoint_info.return_value = checkpoint_info mmp_client.return_value = mmp_client_object instance = Instance({ Operator.F_INIT: ['in[]'], @@ -83,7 +83,7 @@ def test_create_instance( mmp_client_object = MagicMock() mmp_client_object.request_peers.return_value = (None, None, None) checkpoint_info = (datetime.now(timezone.utc), Checkpoints(), None) - mmp_client_object.register_instance.return_value = checkpoint_info + mmp_client_object.get_checkpoint_info.return_value = checkpoint_info mmp_client.return_value = mmp_client_object ports = { Operator.F_INIT: ['in'], diff --git a/libmuscle/python/libmuscle/test/test_snapshot_manager.py b/libmuscle/python/libmuscle/test/test_snapshot_manager.py index bfee09a1..462c4cd9 100644 --- a/libmuscle/python/libmuscle/test/test_snapshot_manager.py +++ b/libmuscle/python/libmuscle/test/test_snapshot_manager.py @@ -19,7 +19,7 @@ def test_no_checkpointing(caplog: pytest.LogCaptureFixture, tmp_path: Path communicator.get_message_counts.return_value = {} snapshot_manager = SnapshotManager(Reference('test'), manager, communicator) - snapshot_manager.set_checkpoint_info( + snapshot_manager._set_checkpoint_info( datetime.now(timezone.utc), Checkpoints(), None) snapshot_manager.reuse_instance(None, Path(tmp_path)) @@ -45,7 +45,7 @@ def test_save_load_checkpoint(tmp_path: Path) -> None: snapshot_manager = SnapshotManager(instance_id, manager, communicator) checkpoints = Checkpoints(simulation_time=[CheckpointRangeRule(every=1)]) - snapshot_manager.set_checkpoint_info( + snapshot_manager._set_checkpoint_info( datetime.now(timezone.utc), checkpoints, None) snapshot_manager.reuse_instance(None, tmp_path) @@ -73,7 +73,7 @@ def test_save_load_checkpoint(tmp_path: Path) -> None: snapshot_manager2 = SnapshotManager(instance_id, manager, communicator) - snapshot_manager2.set_checkpoint_info( + snapshot_manager2._set_checkpoint_info( datetime.now(timezone.utc), checkpoints, fpath) communicator.restore_message_counts.assert_called_with(port_message_counts) From d3bb5cdeedb2b712ce9ab953f61c916eceaa33af Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Wed, 16 Nov 2022 13:52:10 +0100 Subject: [PATCH 075/183] Revert std::move additions Give warnings in clang++ version 10, but not in version 14. Having std::move prevents RVO, so removing it. --- libmuscle/cpp/src/libmuscle/data.cpp | 2 +- libmuscle/cpp/src/libmuscle/mcp/tcp_transport_client.cpp | 2 +- libmuscle/cpp/src/libmuscle/mpp_message.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/libmuscle/cpp/src/libmuscle/data.cpp b/libmuscle/cpp/src/libmuscle/data.cpp index 666a8423..53b57694 100644 --- a/libmuscle/cpp/src/libmuscle/data.cpp +++ b/libmuscle/cpp/src/libmuscle/data.cpp @@ -962,7 +962,7 @@ DataConstRef DataConstRef::grid_data_( Data result = Data::byte_array(num_elems); char * data_copy = result.as_byte_array(); std::copy(data, data + num_elems, data_copy); - return std::move(result); + return result; } } diff --git a/libmuscle/cpp/src/libmuscle/mcp/tcp_transport_client.cpp b/libmuscle/cpp/src/libmuscle/mcp/tcp_transport_client.cpp index c0e95b90..959737d2 100644 --- a/libmuscle/cpp/src/libmuscle/mcp/tcp_transport_client.cpp +++ b/libmuscle/cpp/src/libmuscle/mcp/tcp_transport_client.cpp @@ -129,7 +129,7 @@ DataConstRef TcpTransportClient::call( int64_t length = recv_int64(socket_fd_); auto result = Data::byte_array(length); recv_all(socket_fd_, result.as_byte_array(), result.size()); - return std::move(result); + return result; } void TcpTransportClient::close() { diff --git a/libmuscle/cpp/src/libmuscle/mpp_message.cpp b/libmuscle/cpp/src/libmuscle/mpp_message.cpp index 5f796224..2962e31c 100644 --- a/libmuscle/cpp/src/libmuscle/mpp_message.cpp +++ b/libmuscle/cpp/src/libmuscle/mpp_message.cpp @@ -76,7 +76,7 @@ DataConstRef MPPMessage::encoded() const { auto bytes = Data::byte_array(sbuf.size()); memcpy(bytes.as_byte_array(), sbuf.data(), sbuf.size()); - return std::move(bytes); + return bytes; } } } From c697d532075ac871892a562c743f10f008e98eec Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Wed, 16 Nov 2022 13:52:58 +0100 Subject: [PATCH 076/183] Removing const in method that returns by value --- libmuscle/cpp/src/libmuscle/peer_manager.cpp | 2 +- libmuscle/cpp/src/libmuscle/peer_manager.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/libmuscle/cpp/src/libmuscle/peer_manager.cpp b/libmuscle/cpp/src/libmuscle/peer_manager.cpp index 51772072..d5e8923c 100644 --- a/libmuscle/cpp/src/libmuscle/peer_manager.cpp +++ b/libmuscle/cpp/src/libmuscle/peer_manager.cpp @@ -65,7 +65,7 @@ std::vector PeerManager::get_peer_locations( return peer_locations_.at(peer_instance); } -std::vector const PeerManager::get_peer_endpoints( +std::vector PeerManager::get_peer_endpoints( Identifier const & port, std::vector const & slot ) const diff --git a/libmuscle/cpp/src/libmuscle/peer_manager.hpp b/libmuscle/cpp/src/libmuscle/peer_manager.hpp index c6ac5ff7..024b40fa 100644 --- a/libmuscle/cpp/src/libmuscle/peer_manager.hpp +++ b/libmuscle/cpp/src/libmuscle/peer_manager.hpp @@ -83,7 +83,7 @@ class PeerManager { * @param slot The slot to send or receive on. * @return The peer endpoints. */ - std::vector const get_peer_endpoints( + std::vector get_peer_endpoints( ymmsl::Identifier const & port, std::vector const & slot) const; From 06fdcdffe255e948a6bbbd3548379fa92d1f64f9 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Wed, 16 Nov 2022 17:10:42 +0100 Subject: [PATCH 077/183] More unified integration tests with native code --- integration_test/conftest.py | 118 ++++++++++++++----- integration_test/test_cpp_macro_micro.py | 48 ++------ integration_test/test_fortran_macro_micro.py | 54 ++------- integration_test/test_multicast_cpp.py | 66 ++--------- 4 files changed, 113 insertions(+), 173 deletions(-) diff --git a/integration_test/conftest.py b/integration_test/conftest.py index 76cf680d..ad59842a 100644 --- a/integration_test/conftest.py +++ b/integration_test/conftest.py @@ -1,6 +1,9 @@ import logging import multiprocessing as mp import os +import subprocess +import sys +from contextlib import contextmanager, ExitStack from pathlib import Path import pytest @@ -47,9 +50,80 @@ def make_server_process(ymmsl_doc, tmpdir): process.join() +def _python_wrapper(instance_name, muscle_manager, callable): + sys.argv.append(f'--muscle-instance={instance_name}') + sys.argv.append(f'--muscle-manager={muscle_manager}') + callable() + + +def run_manager_with_actors( + ymmsl_text, tmpdir, + cpp_actors={}, fortran_actors={}, python_actors={}): + """Start muscle_manager along with C++ and python actors. + + C++ actors are a dict of instance->executable_path. Executable paths are + assumed to be relative to ../libmuscle/cpp/build/. LD_LIBRARY_PATH is + automatically updated to include the msgpack library path. + + Fortran actors are a dict of instance->executable_path. Executable paths are + assumed to be relative to ../libmuscle/fortran/build/. LD_LIBRARY_PATH is + automatically updated to include the msgpack library path. + + Python actors are a dict of instance->callable, where the callable + implements the python actor. + """ + env = os.environ.copy() + ymmsl_doc = ymmsl.load(ymmsl_text) + libmuscle_dir = Path(__file__).parents[1] / 'libmuscle' + cpp_build_dir = libmuscle_dir / 'cpp' / 'build' + fortran_build_dir = libmuscle_dir / 'fortran' / 'build' + + with ExitStack() as stack: + # start muscle_manager and extract manager location + ctx = contextmanager(make_server_process)(ymmsl_doc, tmpdir) + env['MUSCLE_MANAGER'] = stack.enter_context(ctx) + + lib_paths = [cpp_build_dir / 'msgpack' / 'msgpack' / 'lib'] + if 'LD_LIBRARY_PATH' in env: + env['LD_LIBRARY_PATH'] += ':' + ':'.join(map(str, lib_paths)) + else: + env['LD_LIBRARY_PATH'] = ':'.join(map(str, lib_paths)) + + native_processes = [] + # start native actors + for actors, build_dir in ((cpp_actors, cpp_build_dir), + (fortran_actors, fortran_build_dir)): + for instance_name, executable_path in actors.items(): + executable = build_dir / executable_path + f_out = stack.enter_context( + (tmpdir / f'{instance_name}_stdout.txt').open('w')) + f_err = stack.enter_context( + (tmpdir / f'{instance_name}_stderr.txt').open('w')) + native_processes.append(subprocess.Popen( + [str(executable), f'--muscle-instance={instance_name}'], + env=env, stdout=f_out, stderr=f_err)) + + # start python actors + python_processes = [] + for instance_name, callable in python_actors.items(): + proc = mp.Process( + target=_python_wrapper, + args=(instance_name, env['MUSCLE_MANAGER'], callable)) + proc.start() + python_processes.append(proc) + + # check results + for proc in native_processes: + proc.wait() + assert proc.returncode == 0 + for proc in python_processes: + proc.join() + assert proc.exitcode == 0 + + @pytest.fixture -def mmp_server_process(yatiml_log_warning, tmpdir): - ymmsl_text = ( +def mmp_server_config(yatiml_log_warning): + return ( 'ymmsl_version: v0.1\n' 'model:\n' ' name: test_model\n' @@ -74,14 +148,17 @@ def mmp_server_process(yatiml_log_warning, tmpdir): ' macro_implementation: macro.py\n' ' micro_implementation: micro.py\n' ) - ymmsl_doc = ymmsl.load(ymmsl_text) + +@pytest.fixture +def mmp_server_process(mmp_server_config, tmpdir): + ymmsl_doc = ymmsl.load(mmp_server_config) yield from make_server_process(ymmsl_doc, tmpdir) @pytest.fixture -def mmp_server_process_simple(tmpdir, yatiml_log_warning): - ymmsl_text = ( +def mmp_server_config_simple(yatiml_log_warning): + return ( 'ymmsl_version: v0.1\n' 'model:\n' ' name: test_model\n' @@ -101,36 +178,17 @@ def mmp_server_process_simple(tmpdir, yatiml_log_warning): ' - [1.0, 2.0]\n' ' - [3.0, 1.0]\n' ) - ymmsl_doc = ymmsl.load(ymmsl_text) + +@pytest.fixture +def mmp_server_process_simple(mmp_server_config_simple, tmpdir): + ymmsl_doc = ymmsl.load(mmp_server_config_simple) yield from make_server_process(ymmsl_doc, tmpdir) @pytest.fixture -def mmp_server(yatiml_log_warning): - ymmsl_text = ( - 'ymmsl_version: v0.1\n' - 'model:\n' - ' name: test_model\n' - ' components:\n' - ' macro: macro_implementation\n' - ' micro:\n' - ' implementation: micro_implementation\n' - ' multiplicity: [10]\n' - ' conduits:\n' - ' macro.out: micro.in\n' - ' micro.out: macro.in\n' - 'settings:\n' - ' test1: 13\n' - ' test2: 13.3\n' - ' test3: testing\n' - ' test4: True\n' - ' test5: [2.3, 5.6]\n' - ' test6:\n' - ' - [1.0, 2.0]\n' - ' - [3.0, 1.0]\n' - ) - ymmsl_doc = ymmsl.load(ymmsl_text) +def mmp_server(mmp_server_config_simple, yatiml_log_warning): + ymmsl_doc = ymmsl.load(mmp_server_config_simple) manager = Manager(ymmsl_doc) yield manager._server diff --git a/integration_test/test_cpp_macro_micro.py b/integration_test/test_cpp_macro_micro.py index 02156a44..fda1a232 100644 --- a/integration_test/test_cpp_macro_micro.py +++ b/integration_test/test_cpp_macro_micro.py @@ -1,21 +1,11 @@ -import multiprocessing as mp -import os from pathlib import Path -import subprocess -import sys import numpy as np from libmuscle import Instance, Message from ymmsl import Operator -from .conftest import skip_if_python_only - - -def run_macro(instance_id: str, manager_location: str): - sys.argv.append(f'--muscle-instance={instance_id}') - sys.argv.append(f'--muscle-manager={manager_location}') - macro() +from .conftest import skip_if_python_only, run_manager_with_actors def macro(): @@ -47,34 +37,12 @@ def macro(): @skip_if_python_only -def test_cpp_macro_micro(mmp_server_process_simple, tmp_path): +def test_cpp_macro_micro(mmp_server_config_simple, tmp_path): # create C++ micro model # see libmuscle/cpp/src/libmuscle/tests/micro_model_test.cpp - cpp_build_dir = Path(__file__).parents[1] / 'libmuscle' / 'cpp' / 'build' - env = os.environ.copy() - lib_paths = [cpp_build_dir / 'msgpack' / 'msgpack' / 'lib'] - if 'LD_LIBRARY_PATH' in env: - env['LD_LIBRARY_PATH'] += ':' + ':'.join(map(str, lib_paths)) - else: - env['LD_LIBRARY_PATH'] = ':'.join(map(str, lib_paths)) - - env['MUSCLE_MANAGER'] = mmp_server_process_simple - cpp_test_dir = cpp_build_dir / 'libmuscle' / 'tests' - cpp_test_micro = cpp_test_dir / 'micro_model_test' - - with (tmp_path / 'cpp_stdout.txt').open('w') as f_out: - with (tmp_path / 'cpp_stderr.txt').open('w') as f_err: - micro_result = subprocess.Popen( - [str(cpp_test_micro), '--muscle-instance=micro'], env=env, - stdout=f_out, stderr=f_err) - - # run macro model - macro_process = mp.Process( - target=run_macro, args=('macro', mmp_server_process_simple)) - macro_process.start() - - # check results - micro_result.wait() - assert micro_result.returncode == 0 - macro_process.join() - assert macro_process.exitcode == 0 + run_manager_with_actors( + mmp_server_config_simple, + tmp_path, + {'micro': Path('libmuscle') / 'tests' / 'micro_model_test'}, + {}, + {'macro': macro}) diff --git a/integration_test/test_fortran_macro_micro.py b/integration_test/test_fortran_macro_micro.py index e4908b75..0717891c 100644 --- a/integration_test/test_fortran_macro_micro.py +++ b/integration_test/test_fortran_macro_micro.py @@ -1,21 +1,11 @@ -import multiprocessing as mp -import os from pathlib import Path -import subprocess -import sys import numpy as np from libmuscle import Instance, Message from ymmsl import Operator -from .conftest import skip_if_python_only - - -def run_macro(instance_id: str, manager_location: str): - sys.argv.append(f'--muscle-instance={instance_id}') - sys.argv.append(f'--muscle-manager={manager_location}') - macro() +from .conftest import skip_if_python_only, run_manager_with_actors def macro(): @@ -48,40 +38,12 @@ def macro(): @skip_if_python_only -def test_fortran_macro_micro(mmp_server_process_simple, tmp_path): +def test_fortran_macro_micro(mmp_server_config_simple, tmp_path): # create Fortran micro model # see libmuscle/fortran/src/libmuscle/tests/fortran_micro_model_test.f90 - cpp_build_dir = Path(__file__).parents[1] / 'libmuscle' / 'cpp' / 'build' - env = os.environ.copy() - lib_paths = [cpp_build_dir / 'msgpack' / 'msgpack' / 'lib'] - if 'LD_LIBRARY_PATH' in env: - env['LD_LIBRARY_PATH'] += ':' + ':'.join(map(str, lib_paths)) - else: - env['LD_LIBRARY_PATH'] = ':'.join(map(str, lib_paths)) - - env['MUSCLE_MANAGER'] = mmp_server_process_simple - - fortran_test_dir = ( - Path(__file__).parents[1] / 'libmuscle' / 'fortran' / 'build' / - 'libmuscle' / 'tests') - fortran_test_micro = fortran_test_dir / 'fortran_micro_model_test' - - with (tmp_path / 'fortran_stdout.txt').open('w') as f_out: - with (tmp_path / 'fortran_stderr.txt').open('w') as f_err: - micro_result = subprocess.Popen( - [ - str(fortran_test_micro), '--muscle-instance=micro', - f'--muscle-manager={mmp_server_process_simple}' - ], env=env, stdout=f_out, stderr=f_err) - - # run macro model - macro_process = mp.Process( - target=run_macro, - args=('macro', mmp_server_process_simple)) - macro_process.start() - - # check results - micro_result.wait() - assert micro_result.returncode == 0 - macro_process.join() - assert macro_process.exitcode == 0 + run_manager_with_actors( + mmp_server_config_simple, + tmp_path, + {}, + {'micro': Path('libmuscle') / 'tests' / 'fortran_micro_model_test'}, + {'macro': macro}) diff --git a/integration_test/test_multicast_cpp.py b/integration_test/test_multicast_cpp.py index d97fc0d6..7daa62d3 100644 --- a/integration_test/test_multicast_cpp.py +++ b/integration_test/test_multicast_cpp.py @@ -1,18 +1,10 @@ from pathlib import Path -import sys import ymmsl from libmuscle import Instance -from libmuscle.manager.manager import Manager -from libmuscle.manager.run_dir import RunDir -# when executing this file as a component, .conftest cannot be resolved -if __name__ == "__main__": - def skip_if_python_only(func): - return func -else: - from .conftest import skip_if_python_only +from .conftest import skip_if_python_only, run_manager_with_actors def receiver(): @@ -28,19 +20,9 @@ def receiver(): @skip_if_python_only -def test_multicast_cpp(tmpdir): - tmppath = Path(str(tmpdir)) - - # find our test component and its requirements - cpp_build_dir = Path(__file__).parents[1] / 'libmuscle' / 'cpp' / 'build' - lib_paths = [cpp_build_dir / 'msgpack' / 'msgpack' / 'lib'] - ld_lib_path = ':'.join(map(str, lib_paths)) - - cpp_test_dir = cpp_build_dir / 'libmuscle' / 'tests' - test_component = cpp_test_dir / 'component_test' - - # make config - ymmsl_text = f""" +def test_multicast_cpp(tmp_path): + run_manager_with_actors( + """ ymmsl_version: v0.1 model: name: test_model @@ -54,38 +36,8 @@ def test_multicast_cpp(tmpdir): conduits: multicast.out: - receiver1.in - - receiver2.in -implementations: - component: - env: - LD_LIBRARY_PATH: {ld_lib_path} - executable: {test_component} - receiver: - executable: {sys.executable} - args: - - {__file__} -resources: - multicast: - threads: 1 - receiver1: - threads: 1 - receiver2: - threads: 1""" - - config = ymmsl.load(ymmsl_text) - config.as_configuration().check_consistent() - - # set up - run_dir = RunDir(tmppath / 'run') - - # launch MUSCLE Manager with simulation - manager = Manager(config, run_dir) - manager.start_instances() - success = manager.wait() - - # check that all did not go well - assert success - - -if __name__ == "__main__": - receiver() + - receiver2.in""", + tmp_path, + {'multicast': Path('libmuscle') / 'tests' / 'component_test'}, + {}, + {'receiver1': receiver, 'receiver2': receiver}) From 634f9fdfe2d1947078e6704969499aa6b8f1b559 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Thu, 17 Nov 2022 10:07:41 +0100 Subject: [PATCH 078/183] Update checkpoint trigger logic --- .../python/libmuscle/checkpoint_triggers.py | 60 ++++++++++++------- libmuscle/python/libmuscle/instance.py | 15 +++++ .../python/libmuscle/snapshot_manager.py | 27 +++++---- 3 files changed, 68 insertions(+), 34 deletions(-) diff --git a/libmuscle/python/libmuscle/checkpoint_triggers.py b/libmuscle/python/libmuscle/checkpoint_triggers.py index ff12918b..dbb4f321 100644 --- a/libmuscle/python/libmuscle/checkpoint_triggers.py +++ b/libmuscle/python/libmuscle/checkpoint_triggers.py @@ -203,6 +203,8 @@ def set_checkpoint_info( self._has_checkpoints = True self._monotonic_reference = _utc_to_monotonic(utc_reference) + self._checkpoint_at_end = checkpoints.at_end + self._wall = CombinedCheckpointTriggers(checkpoints.wallclock_time) self._prevwall = 0.0 self._nextwall = self._wall.next_checkpoint(0.0) # type: Optional[float] @@ -213,7 +215,6 @@ def set_checkpoint_info( self._sim_reset = True self._first_reuse = True - self._max_f_init_next_timestamp = None # type: Optional[float] # These attributes are only used to check if implementations are # following the guidelines @@ -226,44 +227,47 @@ def elapsed_walltime(self) -> float: """ return time.monotonic() - self._monotonic_reference + def snapshots_enabled(self) -> bool: + """Check if the current workflow has snapshots enabled. + """ + return self._has_checkpoints + def should_save_snapshot(self, timestamp: float) -> bool: """Handles instance.should_save_snapshot """ if not self._has_checkpoints: return False - if self._should_have_saved: - _checkpoint_error('"should_save_snapshot" or ' - '"should_save_final_snapshot" returned positive' - ' but no snapshot was saved before the next call') + self.__check_should_have_saved() elapsed_walltime = self.elapsed_walltime() value = self.__should_save(elapsed_walltime, timestamp) self._should_have_saved = value return value - def should_save_final_snapshot(self) -> bool: + def should_save_final_snapshot( + self, do_reuse: bool, f_init_max_timestamp: Optional[float] + ) -> bool: """Handles instance.should_save_final_snapshot """ if not self._has_checkpoints: return False - if self._should_have_saved: - _checkpoint_error('"should_save_snapshot" or ' - '"should_save_final_snapshot" returned positive' - ' but no snapshot was saved before the next call') + self.__check_should_have_saved() value = False - if self._max_f_init_next_timestamp is None: - # If the messages on F_INIT do not supply a next_timestamp, we will - # always snapshot just before O_I + if not do_reuse and self._checkpoint_at_end: value = True - self._last_triggers = ['No "next_timestamp" provided on F_INIT' - ' messages'] + self._last_triggers.append('at_end') + elif f_init_max_timestamp is None: + # No F_INIT messages received: reuse triggered on muscle_settings_in + # message. + _logger.debug('Reuse triggered by muscle_settings_in.' + ' Not creating a snapshot.') + self._sim_reset = True else: elapsed_walltime = self.elapsed_walltime() - value = self.__should_save(elapsed_walltime, - self._max_f_init_next_timestamp) + value = self.__should_save(elapsed_walltime, f_init_max_timestamp) self._should_have_saved = value self._should_save_final_called = True @@ -303,16 +307,15 @@ def update_checkpoints(self, timestamp: float, final: bool) -> None: ' Instance.snapshots_enabled() to test if it is useful' ' to save a snapshot.') return + if final and self._saved_final_checkpoint: + raise RuntimeError( + 'You may only save a final snapshot once per reuse loop.') self._prevwall = self.elapsed_walltime() self._nextwall = self._wall.next_checkpoint(self._prevwall) - if final and self._max_f_init_next_timestamp is not None: - simulation_time = self._max_f_init_next_timestamp - else: - simulation_time = timestamp - self._prevsim = simulation_time - self._nextsim = self._sim.next_checkpoint(simulation_time) + self._prevsim = timestamp + self._nextsim = self._sim.next_checkpoint(timestamp) # this method is also called during resume, after which we no longer # consider the simulation_time as reset @@ -327,6 +330,17 @@ def get_triggers(self) -> List[str]: self._last_triggers = [] return triggers + def __check_should_have_saved(self) -> None: + """Check if a snapshot is saved when required.""" + if self._should_have_saved: + _checkpoint_error('"should_save_snapshot" or ' + '"should_save_final_snapshot" returned positive' + ' but no snapshot was saved before the next call' + ' to a should_save_ method.' + ' You must call the corresponding save_snapshot' + ' or save_final_snapshot method when should_save_' + ' returns True.') + def __should_save(self, walltime: float, simulation_time: float) -> bool: """Check if a checkpoint should be taken diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py index c064e6af..44525347 100644 --- a/libmuscle/python/libmuscle/instance.py +++ b/libmuscle/python/libmuscle/instance.py @@ -390,6 +390,17 @@ def receive_with_settings( """ return self.__receive_message(port_name, slot, default, True) + def snapshots_enabled(self) -> bool: + """Check if the current workflow has snapshots enabled. + + When snapshots are not enabled, all calls to should_save_snapshot and + should_save_final_snapshot will return False. + + Returns: + True iff checkpoint rules are defined in the workflow yMMSL. + """ + return self._snapshot_manager.snapshots_enabled() + def resuming(self) -> bool: """Check if this instance is resuming from a snapshot. @@ -517,6 +528,10 @@ def should_save_final_snapshot(self, *, apply_overlay: bool = True) -> bool: True iff a final snapshot should be taken by the submodel according to the checkpoint rules provided in the ymmsl configuration. """ + if self._do_reuse is not None: + raise RuntimeError( + 'You may not call should_save_final_snapshot more than once' + ' per reuse loop.') self._do_reuse = self.__check_reuse_instance(apply_overlay) f_init_max_timestamp = max( (msg.timestamp for msg in self._f_init_cache.values()), diff --git a/libmuscle/python/libmuscle/snapshot_manager.py b/libmuscle/python/libmuscle/snapshot_manager.py index ffd7e4b2..00c1a4ca 100644 --- a/libmuscle/python/libmuscle/snapshot_manager.py +++ b/libmuscle/python/libmuscle/snapshot_manager.py @@ -41,7 +41,7 @@ def __init__(self, self._manager = manager self._first_reuse = True - self._trigger = TriggerManager() + self._trigger_manager = TriggerManager() self._resume_from_snapshot = None # type: Optional[Snapshot] self._snapshot_directory = None # type: Optional[Path] self._next_snapshot_num = 1 @@ -63,13 +63,13 @@ def _set_checkpoint_info(self, checkpoints: requested workflow checkpoints resume: previous snapshot to resume from (or None if not resuming) """ - self._trigger.set_checkpoint_info(utc_reference, checkpoints) + self._trigger_manager.set_checkpoint_info(utc_reference, checkpoints) if resume is not None: self.__load_snapshot(resume) snapshot = cast(Snapshot, self._resume_from_snapshot) self._communicator.restore_message_counts( snapshot.port_message_counts) - self._trigger.update_checkpoints( + self._trigger_manager.update_checkpoints( snapshot.message.timestamp, snapshot.is_final_snapshot) @@ -79,7 +79,7 @@ def reuse_instance(self, snapshot_directory: Optional[Path]) -> None: Args: snapshot_directory: Path to store this instance's snapshots in. """ - self._trigger.reuse_instance() + self._trigger_manager.reuse_instance() self._snapshot_directory = snapshot_directory @@ -88,6 +88,11 @@ def reuse_instance(self, snapshot_directory: Optional[Path]) -> None: else: self._resume_from_snapshot = None + def snapshots_enabled(self) -> bool: + """Check if the current workflow has snapshots enabled. + """ + return self._trigger_manager.snapshots_enabled() + def resuming(self) -> bool: """Check if we are resuming during this reuse iteration. """ @@ -115,15 +120,15 @@ def load_snapshot(self) -> Message: def should_save_snapshot(self, timestamp: float) -> bool: """See :meth:`TriggerManager.should_save_snapshot` """ - return self._trigger.should_save_snapshot(timestamp) + return self._trigger_manager.should_save_snapshot(timestamp) def should_save_final_snapshot( - self, do_reuse: bool, - f_init_max_timestamp: Optional[float] + self, do_reuse: bool, f_init_max_timestamp: Optional[float] ) -> bool: """See :meth:`TriggerManager.should_save_final_snapshot` """ - return self._trigger.should_save_final_snapshot() + return self._trigger_manager.should_save_final_snapshot( + do_reuse, f_init_max_timestamp) def save_snapshot(self, msg: Message) -> None: """Save snapshot contained in the message object. @@ -142,8 +147,8 @@ def __save_snapshot(self, msg: Message, final: bool) -> None: msg: message object representing the snapshot final: True iff called from save_final_snapshot """ - triggers = self._trigger.get_triggers() - wallclock_time = self._trigger.elapsed_walltime() + triggers = self._trigger_manager.get_triggers() + wallclock_time = self._trigger_manager.elapsed_walltime() port_message_counts = self._communicator.get_message_counts() snapshot = MsgPackSnapshot( @@ -153,7 +158,7 @@ def __save_snapshot(self, msg: Message, final: bool) -> None: metadata = SnapshotMetadata.from_snapshot(snapshot, str(path)) self._manager.submit_snapshot_metadata(self._instance_id, metadata) - self._trigger.update_checkpoints(msg.timestamp, final) + self._trigger_manager.update_checkpoints(msg.timestamp, final) def __load_snapshot(self, snapshot_location: Path) -> None: """Load a previously stored snapshot from the filesystem From a808a9e870e3a1657b7246b60c41daa82fcc7088 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Thu, 17 Nov 2022 10:45:36 +0100 Subject: [PATCH 079/183] Update tox.ini: pass cmdline args to pytest Example: `tox -- --lf` to rerun failed pytest tests --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index 717d5107..fcaa0c30 100644 --- a/tox.ini +++ b/tox.ini @@ -18,7 +18,7 @@ passenv = commands = mypy - pytest + pytest {posargs} flake8 libmuscle/python/libmuscle integration_test [gh-actions] From 10d8fb82df0525b2c35b3cc45002a7e4c9510c0b Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Thu, 17 Nov 2022 10:51:10 +0100 Subject: [PATCH 080/183] Update checkpoint trigger tests --- .../python/libmuscle/checkpoint_triggers.py | 7 ++- libmuscle/python/libmuscle/instance.py | 6 +- .../python/libmuscle/snapshot_manager.py | 17 ++++-- .../test/test_checkpoint_triggers.py | 60 +++++++++---------- 4 files changed, 49 insertions(+), 41 deletions(-) diff --git a/libmuscle/python/libmuscle/checkpoint_triggers.py b/libmuscle/python/libmuscle/checkpoint_triggers.py index dbb4f321..61b6cdca 100644 --- a/libmuscle/python/libmuscle/checkpoint_triggers.py +++ b/libmuscle/python/libmuscle/checkpoint_triggers.py @@ -294,11 +294,12 @@ def reuse_instance(self) -> None: self._saved_final_checkpoint = False def update_checkpoints(self, timestamp: float, final: bool) -> None: - """Update last and next checkpoint times when a snapshot is made + """Update last and next checkpoint times when a snapshot is made. Args: - timestamp: timestamp as reported by the instance - next_timestamp: next timestamp as reported by the instance + timestamp: timestamp as reported by the instance (or from incoming + F_INIT messages when final=True). + final: True iff this is coming from a save_final_snapshot call. """ if not self._has_checkpoints: _logger.info('Saving a snapshot, but no snapshots requested by the' diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py index 44525347..efbb9b92 100644 --- a/libmuscle/python/libmuscle/instance.py +++ b/libmuscle/python/libmuscle/instance.py @@ -560,7 +560,11 @@ def save_final_snapshot(self, message: Message) -> None: attribute can be used to store the internal state of the submodel. """ - return self._snapshot_manager.save_final_snapshot(message) + f_init_max_timestamp = max( + (msg.timestamp for msg in self._f_init_cache.values()), + default=None) + return self._snapshot_manager.save_final_snapshot( + message, f_init_max_timestamp) def _register(self) -> None: """Register this instance with the manager. diff --git a/libmuscle/python/libmuscle/snapshot_manager.py b/libmuscle/python/libmuscle/snapshot_manager.py index 00c1a4ca..5f67bfed 100644 --- a/libmuscle/python/libmuscle/snapshot_manager.py +++ b/libmuscle/python/libmuscle/snapshot_manager.py @@ -135,12 +135,16 @@ def save_snapshot(self, msg: Message) -> None: """ self.__save_snapshot(msg, False) - def save_final_snapshot(self, msg: Message) -> None: + def save_final_snapshot( + self, msg: Message, f_init_max_timestamp: Optional[float]) -> None: """Save final snapshot contained in the message object """ - self.__save_snapshot(msg, True) + self.__save_snapshot(msg, True, f_init_max_timestamp) - def __save_snapshot(self, msg: Message, final: bool) -> None: + def __save_snapshot( + self, msg: Message, final: bool, + f_init_max_timestamp: Optional[float] = None + ) -> None: """Actual implementation used by save_(final_)snapshot. Args: @@ -158,7 +162,12 @@ def __save_snapshot(self, msg: Message, final: bool) -> None: metadata = SnapshotMetadata.from_snapshot(snapshot, str(path)) self._manager.submit_snapshot_metadata(self._instance_id, metadata) - self._trigger_manager.update_checkpoints(msg.timestamp, final) + timestamp = msg.timestamp + if final and f_init_max_timestamp is not None: + # For final snapshots f_init_max_snapshot is the reference time (see + # should_save_Final_snapshot). + timestamp = f_init_max_timestamp + self._trigger_manager.update_checkpoints(timestamp, final) def __load_snapshot(self, snapshot_location: Path) -> None: """Load a previously stored snapshot from the filesystem diff --git a/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py b/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py index 4f6eed0d..0cbf47b2 100644 --- a/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py +++ b/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py @@ -148,58 +148,54 @@ def test_trigger_manager_reference_time(): assert 15.0 < elapsed_walltime <= (15.0 + elapsed_monotonic) -@pytest.mark.skip("To be updated") def test_trigger_manager(): reference = datetime.now(timezone.utc) trigger_manager = TriggerManager() trigger_manager.set_checkpoint_info(reference, Checkpoints( + at_end=True, wallclock_time=[CheckpointAtRule([1e-12])], simulation_time=[CheckpointAtRule([1, 3, 5])])) - trigger_manager.reuse_instance(7) + trigger_manager.reuse_instance() - t, t_next = 0.1, 0.2 - assert trigger_manager.should_save_snapshot(t, t_next) + assert trigger_manager.should_save_snapshot(0.1) triggers = trigger_manager.get_triggers() assert len(triggers) == 1 assert "wallclock_time" in triggers[0] with pytest.raises(RuntimeError): # did not call save in between - trigger_manager.should_save_snapshot(t, t_next) - trigger_manager.update_checkpoints(t, t_next, False) + trigger_manager.should_save_snapshot(0.1) + trigger_manager.update_checkpoints(0.1, False) - t, t_next = 0.2, 0.9 - assert not trigger_manager.should_save_snapshot(t, t_next) + assert not trigger_manager.should_save_snapshot(0.99) - t, t_next = 0.9, 3.1 - assert trigger_manager.should_save_snapshot(t, t_next) - assert len(trigger_manager.get_triggers()) == 1 - trigger_manager.update_checkpoints(t, t_next, False) + assert trigger_manager.should_save_snapshot(3.2) + triggers = trigger_manager.get_triggers() + assert len(triggers) == 1 + assert "simulation_time" in triggers[0] + trigger_manager.update_checkpoints(3.2, False) - t, t_next = 3.1, None - assert trigger_manager.should_save_final_snapshot(t) + assert trigger_manager.should_save_final_snapshot(True, 7.0) with pytest.raises(RuntimeError): # did not call save in between - trigger_manager.should_save_snapshot(t, 4.0) + trigger_manager.should_save_snapshot(4.0) with pytest.raises(RuntimeError): # did not call save in between - trigger_manager.should_save_final_snapshot(t) + trigger_manager.should_save_final_snapshot(True, 7.0) assert len(trigger_manager.get_triggers()) > 0 - trigger_manager.update_checkpoints(t, t_next, True) + trigger_manager.update_checkpoints(7.0, True) - trigger_manager.reuse_instance(None) + trigger_manager.reuse_instance() - t, t_next = 7.1, 8.2 - assert not trigger_manager.should_save_snapshot(t, t_next) + assert not trigger_manager.should_save_snapshot(7.1) with pytest.raises(RuntimeError): # no should_save_final called - trigger_manager.reuse_instance(None) - t, t_next = 8.2, None - assert trigger_manager.should_save_final_snapshot(t) + trigger_manager.reuse_instance() + + assert trigger_manager.should_save_final_snapshot(False, None) with pytest.raises(RuntimeError): # not saved - trigger_manager.reuse_instance(None) - trigger_manager.update_checkpoints(t, t_next, True) + trigger_manager.reuse_instance() + trigger_manager.update_checkpoints(7.1, True) - trigger_manager.reuse_instance(None) + trigger_manager.reuse_instance() -@pytest.mark.skip("To be updated") def test_trigger_manager_warnings(caplog: pytest.LogCaptureFixture, monkeypatch: pytest.MonkeyPatch): monkeypatch.setenv("MUSCLE_DISABLE_CHECKPOINT_ERRORS", "1") @@ -209,15 +205,13 @@ def test_trigger_manager_warnings(caplog: pytest.LogCaptureFixture, trigger_manager.set_checkpoint_info(reference, Checkpoints( simulation_time=[CheckpointAtRule([1, 3, 5])])) - trigger_manager.reuse_instance(2) + trigger_manager.reuse_instance() with caplog.at_level(logging.WARN): n_records = len(caplog.records) - assert trigger_manager.should_save_snapshot(1.5, None) - assert len(caplog.records) == n_records + 1 - assert "next_timestamp" in caplog.records[-1].message + assert trigger_manager.should_save_snapshot(1.5) + assert len(caplog.records) == n_records - n_records = len(caplog.records) - trigger_manager.reuse_instance(None) # suppressed error + trigger_manager.reuse_instance() # suppressed error assert len(caplog.records) > n_records assert "Suppressed checkpoint error" in caplog.records[-1].message From 8b8f4fd9f95dacada20cc101c7142b46bf7e1588 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Thu, 17 Nov 2022 10:59:57 +0100 Subject: [PATCH 081/183] Update snapshot manager tests --- .../python/libmuscle/checkpoint_triggers.py | 2 +- .../libmuscle/test/test_snapshot_manager.py | 48 +++++++++---------- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/libmuscle/python/libmuscle/checkpoint_triggers.py b/libmuscle/python/libmuscle/checkpoint_triggers.py index 61b6cdca..57e26ca7 100644 --- a/libmuscle/python/libmuscle/checkpoint_triggers.py +++ b/libmuscle/python/libmuscle/checkpoint_triggers.py @@ -302,7 +302,7 @@ def update_checkpoints(self, timestamp: float, final: bool) -> None: final: True iff this is coming from a save_final_snapshot call. """ if not self._has_checkpoints: - _logger.info('Saving a snapshot, but no snapshots requested by the' + _logger.info('Saving a snapshot but no checkpoints requested by the' ' workflow. Hint: use Instance.should_save_snapshot(),' ' Instance.should_save_final_snapshot() or' ' Instance.snapshots_enabled() to test if it is useful' diff --git a/libmuscle/python/libmuscle/test/test_snapshot_manager.py b/libmuscle/python/libmuscle/test/test_snapshot_manager.py index 462c4cd9..972e409b 100644 --- a/libmuscle/python/libmuscle/test/test_snapshot_manager.py +++ b/libmuscle/python/libmuscle/test/test_snapshot_manager.py @@ -11,7 +11,6 @@ from libmuscle.snapshot_manager import SnapshotManager -@pytest.mark.skip("To be updated") def test_no_checkpointing(caplog: pytest.LogCaptureFixture, tmp_path: Path ) -> None: manager = MagicMock() @@ -22,19 +21,19 @@ def test_no_checkpointing(caplog: pytest.LogCaptureFixture, tmp_path: Path snapshot_manager._set_checkpoint_info( datetime.now(timezone.utc), Checkpoints(), None) - snapshot_manager.reuse_instance(None, Path(tmp_path)) assert not snapshot_manager.resuming() - assert not snapshot_manager.should_save_snapshot(1, None) - assert not snapshot_manager.should_save_snapshot(5000, None) - assert not snapshot_manager.should_save_final_snapshot(1000) + snapshot_manager.reuse_instance(tmp_path) + assert not snapshot_manager.resuming() + assert not snapshot_manager.should_save_snapshot(1) + assert not snapshot_manager.should_save_snapshot(5000) + assert not snapshot_manager.should_save_final_snapshot(False, None) - with caplog.at_level(logging.INFO, 'libmuscle.snapshot_manager'): + with caplog.at_level(logging.INFO, 'libmuscle'): snapshot_manager.save_snapshot(Message(1.0, None, None)) assert caplog.records[0].levelname == "INFO" assert "no checkpoints" in caplog.records[0].message -@pytest.mark.skip("To be updated") def test_save_load_checkpoint(tmp_path: Path) -> None: manager = MagicMock() communicator = MagicMock() @@ -48,13 +47,14 @@ def test_save_load_checkpoint(tmp_path: Path) -> None: snapshot_manager._set_checkpoint_info( datetime.now(timezone.utc), checkpoints, None) - snapshot_manager.reuse_instance(None, tmp_path) + assert not snapshot_manager.resuming() + snapshot_manager.reuse_instance(tmp_path) with pytest.raises(RuntimeError): snapshot_manager.load_snapshot() assert not snapshot_manager.resuming() - assert snapshot_manager.should_save_snapshot(0.2, 0.4) - snapshot_manager.save_snapshot(Message(0.2, 0.4, 'test data')) + assert snapshot_manager.should_save_snapshot(0.2) + snapshot_manager.save_snapshot(Message(0.2, None, 'test data')) communicator.get_message_counts.assert_called_with() manager.submit_snapshot_metadata.assert_called() @@ -64,30 +64,30 @@ def test_save_load_checkpoint(tmp_path: Path) -> None: assert metadata.triggers assert metadata.wallclock_time > 0.0 assert metadata.timestamp == 0.2 - assert metadata.next_timestamp == 0.4 + assert metadata.next_timestamp is None assert metadata.port_message_counts == port_message_counts assert not metadata.is_final_snapshot - fpath = Path(metadata.snapshot_filename) - assert fpath.parent == tmp_path - assert fpath.name == 'test-1_1.pack' + snapshot_path = Path(metadata.snapshot_filename) + assert snapshot_path.parent == tmp_path + assert snapshot_path.name == 'test-1_1.pack' snapshot_manager2 = SnapshotManager(instance_id, manager, communicator) snapshot_manager2._set_checkpoint_info( - datetime.now(timezone.utc), checkpoints, fpath) + datetime.now(timezone.utc), checkpoints, snapshot_path) communicator.restore_message_counts.assert_called_with(port_message_counts) assert snapshot_manager2.resuming() - snapshot_manager2.reuse_instance(None, tmp_path) + snapshot_manager2.reuse_instance(tmp_path) assert snapshot_manager2.resuming() msg = snapshot_manager2.load_snapshot() assert msg.timestamp == 0.2 - assert msg.next_timestamp == 0.4 + assert msg.next_timestamp is None assert msg.data == 'test data' - assert not snapshot_manager2.should_save_snapshot(0.4, 0.6) - assert snapshot_manager2.should_save_final_snapshot(0.6) - snapshot_manager2.save_final_snapshot(Message(0.6, None, 'test data2')) + assert not snapshot_manager2.should_save_snapshot(0.4) + assert snapshot_manager2.should_save_final_snapshot(True, 1.2) + snapshot_manager2.save_final_snapshot(Message(0.6, None, 'test data2'), 1.2) instance, metadata = manager.submit_snapshot_metadata.call_args[0] assert instance == instance_id @@ -98,10 +98,10 @@ def test_save_load_checkpoint(tmp_path: Path) -> None: assert metadata.next_timestamp is None assert metadata.port_message_counts == port_message_counts assert metadata.is_final_snapshot - fpath = Path(metadata.snapshot_filename) - assert fpath.parent == tmp_path - assert fpath.name == 'test-1_2.pack' + snapshot_path = Path(metadata.snapshot_filename) + assert snapshot_path.parent == tmp_path + assert snapshot_path.name == 'test-1_2.pack' assert snapshot_manager2.resuming() - snapshot_manager2.reuse_instance(None, tmp_path) + snapshot_manager2.reuse_instance(tmp_path) assert not snapshot_manager2.resuming() From d60c5e6680c63818ba4c642376d786d97bfeb796 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Thu, 17 Nov 2022 17:38:52 +0100 Subject: [PATCH 082/183] Update snapshot registry and tests - Stateless actors should also send metadata, so no longer special-cased - Workflow snapshot detection algorithm does an exhaustive search - Use frozenset wherever possible --- .../libmuscle/manager/snapshot_registry.py | 178 +++++++++--------- .../manager/test/test_snapshot_registry.py | 134 +++++++------ 2 files changed, 148 insertions(+), 164 deletions(-) diff --git a/libmuscle/python/libmuscle/manager/snapshot_registry.py b/libmuscle/python/libmuscle/manager/snapshot_registry.py index ed1618e3..6bdad7fa 100644 --- a/libmuscle/python/libmuscle/manager/snapshot_registry.py +++ b/libmuscle/python/libmuscle/manager/snapshot_registry.py @@ -7,11 +7,11 @@ from pathlib import Path from queue import Queue from threading import Thread -from typing import Dict, Optional, Set, List, Tuple, TypeVar +from typing import Dict, Optional, Set, FrozenSet, List, Tuple, TypeVar from ymmsl import ( Reference, Model, Identifier, Implementation, save, - PartialConfiguration, ImplementationState as IState) + PartialConfiguration) from libmuscle.manager.topology_store import TopologyStore from libmuscle.snapshot import SnapshotMetadata @@ -91,15 +91,14 @@ class SnapshotNode: snapshots always have a higher num. instance: Which instance this is a snapshot of. snapshot: The snapshot metadata reported by the instance. - stateful_peers: The set of peers that the instance is connected to that - have state, which we need to check consistency with. + peers: The set of peers that the instance is connected to. consistent_peers: Keeps track of snapshots per peer that are consistent with this one. """ num: int instance: Reference snapshot: SnapshotMetadata - stateful_peers: Set[Reference] + peers: FrozenSet[Reference] consistent_peers: Dict[Reference, List["SnapshotNode"]] = field( default_factory=dict, repr=False) @@ -108,10 +107,9 @@ def __hash__(self) -> int: @property def consistent(self) -> bool: - """Returns True iff there is a consistent checkpoint will all stateful - peers. + """Returns True iff there is a consistent checkpoint with all peers. """ - return self.consistent_peers.keys() == self.stateful_peers + return self.consistent_peers.keys() == self.peers def do_consistency_check( self, @@ -194,12 +192,9 @@ def __init__( self._snapshots = {} # type: _SnapshotDictType self._instances = set() # type: Set[Reference] - self._stateful_instances = set() # type: Set[Reference] for component in config.model.components: - instances = set(component.instances()) - self._instances.update(instances) - if self._is_stateful(component.name): - self._stateful_instances.update(instances) + self._instances.update(component.instances()) + # TODO: create snapshot nodes for starting from scratch def register_snapshot( self, instance: Reference, snapshot: SnapshotMetadata) -> None: @@ -233,7 +228,7 @@ def _add_snapshot( instance: The instance that created the snapshot snapshot: Metadata describing the snapshot """ - stateful_peers = self._get_stateful_peers(instance) + stateful_peers = self._get_peers(instance) i_snapshots = self._snapshots.setdefault(instance, []) # get next number of the snapshot @@ -257,45 +252,44 @@ def _save_workflow_snapshot(self, snapshotnode: SnapshotNode) -> None: snapshotnode: The snapshot node that must be part of the workflow snapshot. """ - selected_snapshots = self._get_workflow_snapshot(snapshotnode) - if selected_snapshots is not None: - self._write_snapshot_ymmsl(selected_snapshots) - self._cleanup_snapshots(selected_snapshots) + workflow_snapshots = self._get_workflow_snapshots(snapshotnode) + for workflow_snapshot in workflow_snapshots: + self._write_snapshot_ymmsl(workflow_snapshot) + self._cleanup_snapshots(workflow_snapshots) - def _get_workflow_snapshot( - self, snapshot: SnapshotNode) -> Optional[List[SnapshotNode]]: - """Check if a workflow snapshot exists that contains the provided node. - - Note: if the provided snapshot node is part of multiple workflow - snapshots, only the most recent is detected and written to disk. + def _get_workflow_snapshots( + self, snapshot: SnapshotNode) -> List[List[SnapshotNode]]: + """Return all workflow snapshots which contain the provided node. Args: snapshotnode: The snapshot node that must be part of the workflow snapshot. + + Returns: + List of workflow snapshots. Each workflow snapshot is a list of + instance snapshot nodes. """ - # This implements a greedy assignment algorithm. if not snapshot.consistent: - return None + return [] # Instances that don't have a snapshot node chosen yet: - instances_to_cover = list( - self._stateful_instances - {snapshot.instance}) + instances_to_cover = list(self._instances - {snapshot.instance}) # Allowed snapshots per instance. This is updated during the heuristic # to further restrict the sets of snapshots as peer snapshots are # selected. # First restriction is that the snapshots have to be locally consistent. - allowed_snapshots = {} # type: Dict[Reference, Set[SnapshotNode]] + allowed_snapshots = {} # type: Dict[Reference, FrozenSet[SnapshotNode]] for instance in instances_to_cover: - allowed_snapshots[instance] = set( + allowed_snapshots[instance] = frozenset( i_snapshot for i_snapshot in self._snapshots.get(instance, []) if i_snapshot.consistent) if not allowed_snapshots[instance]: # there cannot be a workflow snapshot if this instance has no # consistent snapshot nodes - return None + return [] instance = snapshot.instance - allowed_snapshots[instance] = {snapshot} + allowed_snapshots[instance] = frozenset({snapshot}) def num_allowed_snapshots(instance: Reference) -> int: """Get number of allowed snapshots at this point for this instance. @@ -305,18 +299,23 @@ def num_allowed_snapshots(instance: Reference) -> int: """ return len(allowed_snapshots[instance]) + # Do a full, depth-first search for all workflow snapshots + # ======================================================== + + workflow_snapshots = [] selected_snapshots = [snapshot] # This stack stores history of allowed_snapshots and enables roll back - stack = [] # type: List[Dict[Reference, Set[SnapshotNode]]] + stack = [] # type: List[Dict[Reference, FrozenSet[SnapshotNode]]] - # update allowed_snapshots for peers + # Update allowed_snapshots for peers of the selected snapshot for peer, snapshots in snapshot.consistent_peers.items(): - allowed_snapshots[peer].intersection_update(snapshots) - if not allowed_snapshots[peer]: - return None + intersection = allowed_snapshots[peer].intersection(snapshots) + if not intersection: + return [] + allowed_snapshots[peer] = intersection - while instances_to_cover: - # select most constrained instance + while True: + # 1. Select most constrained instance # # Note: we're only interested in the instance with the least allowed # snapshots. Better performance may be possible by not doing a full @@ -331,44 +330,46 @@ def num_allowed_snapshots(instance: Reference) -> int: instances_to_cover.sort(key=num_allowed_snapshots, reverse=True) instance = instances_to_cover.pop() - # select latest snapshot of this instance - snapshot = max(allowed_snapshots[instance], key=attrgetter("num")) + # 2. Select the oldest snapshot of this instance + snapshot = min(allowed_snapshots[instance], key=attrgetter('num')) selected_snapshots.append(snapshot) - # we put a shallow copy on the stack, so we are not allowed to - # modify the sets in the dictionary (see below) + # A shallow copy is ok: the values are immutable frozensets stack.append(allowed_snapshots.copy()) - # update allowed snapshots with the currently selected - allowed_snapshots[instance] = {snapshot} + # 3. Update allowed snapshots based on the newly selected + allowed_snapshots[instance] = frozenset({snapshot}) for peer, snapshots in snapshot.consistent_peers.items(): - # not updating in place to preserve set objects in the stack intersection = allowed_snapshots[peer].intersection(snapshots) if not intersection: break # roll back allowed_snapshots[peer] = intersection else: - # not rolling back, go into next iteration of the while-loop - continue + # 4. Selected snapshot is okay to explore further + if instances_to_cover: + # 4a. There are still instance to cover, return to the start + # of the while loop. + continue + # 4b. We have found a complete workflow snapshot + workflow_snapshots.append(selected_snapshots.copy()) + # Next: perform a roll-back to continue the search - # roll back should stop when selected_snapshots only contains the - # one we forced to be part of the workflow snapshot + # 5. Roll back + # stop when selected_snapshots only contains the one we forced to be + # part of the workflow snapshot while len(selected_snapshots) > 1: - # roll back snapshot = selected_snapshots.pop() instance = snapshot.instance instances_to_cover.append(instance) allowed_snapshots = stack.pop() - allowed_snapshots[instance].remove(snapshot) - if allowed_snapshots[instance]: - # we have a valid next snapshot to try for this instance + intersection = allowed_snapshots[instance] - {snapshot} + allowed_snapshots[instance] = intersection + if intersection: + # We have a valid next snapshot to try for this instance break - # no allowed_snapshots, try another roll back + # No allowed_snapshots, try another roll back else: - # we've exhausted roll back possibilities, there is no - # consistent checkpoint - return None - - return selected_snapshots + # Exhausted all roll back possibilities, so we are done now + return workflow_snapshots def _write_snapshot_ymmsl( self, selected_snapshots: List[SnapshotNode]) -> None: @@ -436,20 +437,32 @@ def _generate_description( '\n'.join(component_table)) def _cleanup_snapshots( - self, selected_snapshots: List[SnapshotNode]) -> None: + self, workflow_snapshots: List[List[SnapshotNode]]) -> None: """Remove all snapshots that are older than the selected snapshots. Args: selected_snapshots: All snapshot nodes of a workflow snapshot """ - # remove all snapshots older than the selected ones + if not workflow_snapshots: + return + + # Find the newest snapshots per instance + newest_snapshots = {snapshot.instance: snapshot + for snapshot in workflow_snapshots[0]} + for workflow_snapshot in workflow_snapshots[1:]: + for snapshot in workflow_snapshot: + if newest_snapshots[snapshot.instance].num < snapshot.num: + newest_snapshots[snapshot.instance] = snapshot + + # Remove all snapshots that are older than the newest snapshots removed_snapshots = set() # type: Set[SnapshotNode] - for snapshot in selected_snapshots: + for snapshot in newest_snapshots.values(): all_snapshots = self._snapshots[snapshot.instance] idx = all_snapshots.index(snapshot) self._snapshots[snapshot.instance] = all_snapshots[idx:] removed_snapshots.update(all_snapshots[:idx]) - # remove all references in SnapshotNode.peer_snapshot to the snapshots + + # Remove all references in SnapshotNode.peer_snapshot to the snapshots # that are cleaned up for snapshot in removed_snapshots: for peer_snapshot in chain.from_iterable( @@ -462,23 +475,19 @@ def _cleanup_snapshots( snapshot) @lru_cache(maxsize=None) - def _get_stateful_peers(self, instance: Reference) -> Set[Reference]: - """Return the set of stateful peers for the given instance. + def _get_peers(self, instance: Reference) -> FrozenSet[Reference]: + """Return the set of peers for the given instance. - Note: instance is assumed to contain the full index, not just the kernel - name. + Note: instance is assumed to contain the full index, not just the + component name. Args: - instance: Instance to get stateful peers of. See - :meth:`_is_stateful`. + instance: Instance to get peers of. Returns: - Set with all stateful peer instances (including their index). + Frozen set with all peer instances (including their index). """ - return set( - peer - for peer in self._topology_store.get_peer_instances(instance) - if self._is_stateful(peer.without_trailing_ints())) + return frozenset(self._topology_store.get_peer_instances(instance)) @lru_cache(maxsize=None) def _get_connections(self, instance: Reference, peer: Reference @@ -555,22 +564,3 @@ def _implementation(self, kernel: Reference) -> Optional[Implementation]: if implementation in self._configuration.implementations: return self._configuration.implementations[implementation] return None - - @lru_cache(maxsize=None) - def _is_stateful(self, kernel: Reference) -> bool: - """Check if a kernel has a stateful implementation. - - A kernel is considered stateful if: - - There is no Implementation given for the kernel - - Implementation.stateful = ImplementationState.STATEFUL - - Implementation.stateful = ImplementationState.WEAKLY_STATEFUL and the - implementation supports checkpointing. In this case we assume to get - snapshots from these kernels and we take them into account in the - snapshot graph. - """ - implementation = self._implementation(kernel) - if implementation is None: - return True # assume stateful - return (implementation.stateful is IState.STATEFUL or - implementation.stateful is IState.WEAKLY_STATEFUL and - implementation.supports_checkpoint) diff --git a/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py b/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py index 71e3fb7c..dd6c0c46 100644 --- a/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py +++ b/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py @@ -158,25 +158,23 @@ def test_snapshot_config(): print(config.description) -def test_stateful_peers(uq: Configuration, micro_is_stateless: bool) -> None: +def test_peers(uq: Configuration) -> None: snapshot_registry = SnapshotRegistry(uq, None, TopologyStore(uq)) macro = Reference('macro') micro = Reference('micro') qmc = Reference('qmc') rr = Reference('rr') - expected_stateful = {qmc, rr} | {macro + i for i in range(5)} - if not micro_is_stateless: - expected_stateful.update(micro + i for i in range(5)) - assert snapshot_registry._stateful_instances == expected_stateful + all_instances = {qmc, rr} | {macro + i for i in range(5)} + all_instances.update(micro + i for i in range(5)) + assert snapshot_registry._instances == all_instances - assert snapshot_registry._get_stateful_peers(qmc) == {rr} + assert snapshot_registry._get_peers(qmc) == {rr} expected_rr_peers = {qmc} | {macro + i for i in range(5)} - assert snapshot_registry._get_stateful_peers(rr) == expected_rr_peers + assert snapshot_registry._get_peers(rr) == expected_rr_peers for i in range(5): - expected_peers = {rr} if micro_is_stateless else {rr, micro + i} - assert snapshot_registry._get_stateful_peers(macro + i) == expected_peers - assert snapshot_registry._get_stateful_peers(micro + i) == {macro + i} + assert snapshot_registry._get_peers(macro + i) == {rr, micro + i} + assert snapshot_registry._get_peers(micro + i) == {macro + i} def test_connections(uq: Configuration) -> None: @@ -238,19 +236,7 @@ def test_implementation(uq: Configuration) -> None: assert missing_impl is None -def test_stateful(uq: Configuration, micro_is_stateless: bool) -> None: - uq.implementations['macro_impl'].stateful = IState.WEAKLY_STATEFUL - snapshot_registry = SnapshotRegistry(uq, None, TopologyStore(uq)) - - assert snapshot_registry._is_stateful(Reference('macro')) - stateful = snapshot_registry._is_stateful(Reference('micro')) - assert stateful is not micro_is_stateless - - assert snapshot_registry._is_stateful(Reference('unknown')) - - -def test_macro_micro_snapshots( - macro_micro: Configuration, micro_is_stateless: bool) -> None: +def test_macro_micro_snapshots(macro_micro: Configuration) -> None: snapshot_registry = SnapshotRegistry( macro_micro, None, TopologyStore(macro_micro)) # prevent actually writing a ymmsl file, testing that separately @@ -263,58 +249,71 @@ def test_macro_micro_snapshots( assert len(snapshot_registry._snapshots[macro]) == 1 node = snapshot_registry._snapshots[macro][0] - assert node.consistent is micro_is_stateless + assert node.consistent is False assert node.consistent_peers == {} assert node.instance == macro assert node.num == 1 assert node.snapshot is macro_snapshot - if micro_is_stateless: - assert node.stateful_peers == set() - snapshot_registry._write_snapshot_ymmsl.assert_called_once_with([node]) - snapshot_registry._write_snapshot_ymmsl.reset_mock() - else: - assert node.stateful_peers == {micro} - snapshot_registry._write_snapshot_ymmsl.assert_not_called() + assert node.peers == {micro} + snapshot_registry._write_snapshot_ymmsl.assert_not_called() - if not micro_is_stateless: - # Note: this snapshot is not realistic, it should have come in before - # the macro snapshot above. However, it's still useful for testing the - # consistency algorithm - micro_snapshot = make_snapshot(f_i=[2], o_f=[1]) - snapshot_registry._add_snapshot(micro, micro_snapshot) + # Note: this snapshot is not realistic, it should have come in before + # the macro snapshot above. However, it's still useful for testing the + # consistency algorithm + micro_snapshot = make_snapshot(f_i=[2], o_f=[1]) + snapshot_registry._add_snapshot(micro, micro_snapshot) - assert len(snapshot_registry._snapshots[micro]) == 1 - assert not snapshot_registry._snapshots[micro][0].consistent - snapshot_registry._write_snapshot_ymmsl.assert_not_called() + assert len(snapshot_registry._snapshots[micro]) == 1 + assert snapshot_registry._snapshots[micro][0].consistent is False + snapshot_registry._write_snapshot_ymmsl.assert_not_called() - micro_snapshot = make_snapshot(f_i=[3], o_f=[2]) - snapshot_registry._add_snapshot(micro, micro_snapshot) + micro_snapshot = make_snapshot(f_i=[3], o_f=[2]) + snapshot_registry._add_snapshot(micro, micro_snapshot) - # micro snapshots should be cleaned up now! - assert len(snapshot_registry._snapshots[micro]) == 1 - micro_node = snapshot_registry._snapshots[micro][0] - assert micro_node.consistent - snapshot_registry._write_snapshot_ymmsl.assert_called_with( - [micro_node, node]) - snapshot_registry._write_snapshot_ymmsl.reset_mock() + # The first micro snapshots should be cleaned up now + assert len(snapshot_registry._snapshots[micro]) == 1 + micro_node = snapshot_registry._snapshots[micro][0] + assert micro_node.consistent + snapshot_registry._write_snapshot_ymmsl.assert_called_once_with( + [micro_node, node]) + snapshot_registry._write_snapshot_ymmsl.reset_mock() + # 3 micro snapshots in the same reuse: + for _ in range(3): micro_snapshot = make_snapshot(f_i=[4], o_f=[3]) snapshot_registry._add_snapshot(micro, micro_snapshot) - # micro snapshots should be cleaned up now! - assert len(snapshot_registry._snapshots[micro]) == 1 - micro_node = snapshot_registry._snapshots[micro][0] - assert micro_node.consistent - snapshot_registry._write_snapshot_ymmsl.assert_called_with( - [micro_node, node]) - snapshot_registry._write_snapshot_ymmsl.reset_mock() + # Previous micro snapshot should be cleaned up now + assert len(snapshot_registry._snapshots[micro]) == 1 + micro_node = snapshot_registry._snapshots[micro][-1] + assert snapshot_registry._write_snapshot_ymmsl.call_count == 3 + snapshot_registry._write_snapshot_ymmsl.assert_called_with( + [micro_node, node]) + snapshot_registry._write_snapshot_ymmsl.reset_mock() macro_snapshot = make_snapshot(o_i=[4], s=[4]) snapshot_registry._add_snapshot(macro, macro_snapshot) snapshot_registry._write_snapshot_ymmsl.assert_called_once() + snapshot_registry._write_snapshot_ymmsl.reset_mock() + # 3 micro snapshots in the same reuse, but inconcistent with previous macro + for _ in range(3): + micro_snapshot = make_snapshot(f_i=[6], o_f=[5]) + snapshot_registry._add_snapshot(micro, micro_snapshot) -def test_uq(uq: Configuration, micro_is_stateless: bool) -> None: + # All three should be present now in addition to the one last used in + # workflow snapshot + assert len(snapshot_registry._snapshots[micro]) == 4 + snapshot_registry._write_snapshot_ymmsl.assert_not_called() + + macro_snapshot = make_snapshot(o_i=[6], s=[6]) + snapshot_registry._add_snapshot(macro, macro_snapshot) + assert snapshot_registry._write_snapshot_ymmsl.call_count == 3 + assert len(snapshot_registry._snapshots[micro]) == 1 + assert len(snapshot_registry._snapshots[macro]) == 1 + + +def test_uq(uq: Configuration) -> None: snapshot_registry = SnapshotRegistry(uq, None, TopologyStore(uq)) # prevent actually writing a ymmsl file, testing that separately snapshot_registry._write_snapshot_ymmsl = MagicMock() @@ -342,24 +341,19 @@ def test_uq(uq: Configuration, micro_is_stateless: bool) -> None: snapshot_registry._add_snapshot(macro + i, macro_snapshot) node = snapshot_registry._snapshots[macro + i][-1] assert node.consistent_peers.keys() == {rr} - if micro_is_stateless and i == 4: + snapshot_registry._write_snapshot_ymmsl.assert_not_called() + + micro_snapshot = make_snapshot(f_i=[1], o_f=[0]) + for i in range(5): + snapshot_registry._add_snapshot(micro + i, micro_snapshot) + node = snapshot_registry._snapshots[micro + i][-1] + assert node.consistent_peers.keys() == {macro + i} + if i == 4: snapshot_registry._write_snapshot_ymmsl.assert_called_once() snapshot_registry._write_snapshot_ymmsl.reset_mock() else: snapshot_registry._write_snapshot_ymmsl.assert_not_called() - if not micro_is_stateless: - micro_snapshot = make_snapshot(f_i=[1], o_f=[0]) - for i in range(5): - snapshot_registry._add_snapshot(micro + i, micro_snapshot) - node = snapshot_registry._snapshots[micro + i][-1] - assert node.consistent_peers.keys() == {macro + i} - if i == 4: - snapshot_registry._write_snapshot_ymmsl.assert_called_once() - snapshot_registry._write_snapshot_ymmsl.reset_mock() - else: - snapshot_registry._write_snapshot_ymmsl.assert_not_called() - qmc_snapshot = make_snapshot(parameters_out=[1, 1, 1, 1, 1], states_in=[]) snapshot_registry._add_snapshot(qmc, qmc_snapshot) node = snapshot_registry._snapshots[qmc][-1] From 4a3e9c5991f7bb0b1f210ca7d38236fe1b71f61a Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Thu, 17 Nov 2022 17:51:00 +0100 Subject: [PATCH 083/183] Move deregister from manager to reuse_instance() --- libmuscle/python/libmuscle/instance.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py index efbb9b92..a7e5ec63 100644 --- a/libmuscle/python/libmuscle/instance.py +++ b/libmuscle/python/libmuscle/instance.py @@ -146,6 +146,10 @@ def reuse_instance(self, apply_overlay: bool = True) -> bool: snapshot_path = None self._snapshot_manager.reuse_instance(snapshot_path) + if not do_reuse: + self._deregister() + self.__manager.close() + return do_reuse def error_shutdown(self, message: str) -> None: @@ -677,8 +681,6 @@ def __check_reuse_instance(self, apply_overlay: bool) -> bool: if not do_reuse: self.__close_ports() self._communicator.shutdown() - self._deregister() - self.__manager.close() return do_reuse def __receive_message( From 3da1e3e13353c25203027f1d33be479ce9830428 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Fri, 18 Nov 2022 19:03:13 +0100 Subject: [PATCH 084/183] Run native compatibility test also with older OS versions --- .github/workflows/ci_ubuntu18.04_clang.yaml | 19 +++++++++++++++++++ .github/workflows/ci_ubuntu20.04.yaml | 2 +- .github/workflows/ci_ubuntu20.04_clang.yaml | 19 +++++++++++++++++++ ...4_clang.yaml => ci_ubuntu22.04_clang.yaml} | 3 +-- 4 files changed, 40 insertions(+), 3 deletions(-) create mode 100644 .github/workflows/ci_ubuntu18.04_clang.yaml create mode 100644 .github/workflows/ci_ubuntu20.04_clang.yaml rename .github/workflows/{ci_ubuntu_22.04_clang.yaml => ci_ubuntu22.04_clang.yaml} (95%) diff --git a/.github/workflows/ci_ubuntu18.04_clang.yaml b/.github/workflows/ci_ubuntu18.04_clang.yaml new file mode 100644 index 00000000..49864bc2 --- /dev/null +++ b/.github/workflows/ci_ubuntu18.04_clang.yaml @@ -0,0 +1,19 @@ +# Run Continuous Integration for the latest Ubuntu release +# This mainly checks for issues/regressions in the native build +name: native_compatibility_ubuntu18.04_clang +on: + schedule: + - cron: '30 2 * * 0' + push: + branches: + - 'release-*' + - fix_native_compatibility_ci +jobs: + build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + + - name: Run tests on Ubuntu 18.04 with Clang + run: docker run -v "${GITHUB_WORKSPACE}:/workspace" --env LC_ALL=C.UTF-8 --env LANG=C.UTF-8 --env DEBIAN_FRONTEND=noninteractive ubuntu:18.04 /bin/bash -c 'apt-get update && apt-get -y dist-upgrade && apt-get -y install build-essential clang cmake gfortran git valgrind libopenmpi-dev pkg-config python3 python3-pip python3-venv curl && apt-get -y remove libssl-dev zlib1g-dev && useradd -m -d /home/muscle3 muscle3 && echo "Added user" && su muscle3 -c -- "cp -r --preserve=mode /workspace /home/muscle3/muscle3" && echo "Copied files" && su muscle3 -c -- "pip3 install --user -U \"pip<22\" setuptools wheel" && su muscle3 -c -- "pip3 install --user \"ymmsl>=0.12.0,<0.13\" qcg-pilotjob==0.13.1" && su muscle3 -c -- "cd /home/muscle3/muscle3 && CXXFLAGS=-fPIE OMPI_CXX=clang++ CXX=clang++ make test_examples"' diff --git a/.github/workflows/ci_ubuntu20.04.yaml b/.github/workflows/ci_ubuntu20.04.yaml index c9d0c595..f3c51d10 100644 --- a/.github/workflows/ci_ubuntu20.04.yaml +++ b/.github/workflows/ci_ubuntu20.04.yaml @@ -3,7 +3,7 @@ name: native_compatibility_ubuntu20.04 on: schedule: - - cron: '0 4 * * 0' + - cron: '0 3 * * 0' push: branches: - 'release-*' diff --git a/.github/workflows/ci_ubuntu20.04_clang.yaml b/.github/workflows/ci_ubuntu20.04_clang.yaml new file mode 100644 index 00000000..749aca38 --- /dev/null +++ b/.github/workflows/ci_ubuntu20.04_clang.yaml @@ -0,0 +1,19 @@ +# Run Continuous Integration for the latest Ubuntu release +# This mainly checks for issues/regressions in the native build +name: native_compatibility_ubuntu20.04_clang +on: + schedule: + - cron: '30 3 * * 0' + push: + branches: + - 'release-*' + - fix_native_compatibility_ci +jobs: + build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + + - name: Run tests on Ubuntu 20.04 with Clang + run: docker run -v "${GITHUB_WORKSPACE}:/workspace" --env LC_ALL=C.UTF-8 --env LANG=C.UTF-8 --env DEBIAN_FRONTEND=noninteractive ubuntu:20.04 /bin/bash -c 'apt-get update && apt-get -y dist-upgrade && apt-get -y install build-essential clang cmake gfortran git valgrind libopenmpi-dev pkg-config python3 python3-pip python3-venv curl && apt-get -y remove libssl-dev zlib1g-dev && useradd -m -d /home/muscle3 muscle3 && su muscle3 -c -- "cp -r --preserve=mode /workspace /home/muscle3/muscle3" && su muscle3 -c -- "pip3 install --user -U \"pip<22\" setuptools wheel" && su muscle3 -c -- "pip3 install --user \"ymmsl>=0.12.0,<0.13\" qcg-pilotjob==0.13.1" && su muscle3 -c -- "cd /home/muscle3/muscle3 && CXXFLAGS=-fPIE OMPI_CXX=clang++ CXX=clang++ make test_examples"' diff --git a/.github/workflows/ci_ubuntu_22.04_clang.yaml b/.github/workflows/ci_ubuntu22.04_clang.yaml similarity index 95% rename from .github/workflows/ci_ubuntu_22.04_clang.yaml rename to .github/workflows/ci_ubuntu22.04_clang.yaml index 125b3fe6..3c20e9e8 100644 --- a/.github/workflows/ci_ubuntu_22.04_clang.yaml +++ b/.github/workflows/ci_ubuntu22.04_clang.yaml @@ -3,12 +3,11 @@ name: native_compatibility_ubuntu22.04_clang on: schedule: - - cron: '0 3 * * 0' + - cron: '30 4 * * 0' push: branches: - 'release-*' - fix_native_compatibility_ci - - feature/clang_build jobs: build: runs-on: ubuntu-latest From 1c0f6820833bb17b540502ca43478bbe7a746c2b Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Sat, 19 Nov 2022 19:38:51 +0100 Subject: [PATCH 085/183] Add building with clang to documentation (thanks Maarten) --- docs/source/installing.rst.in | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/docs/source/installing.rst.in b/docs/source/installing.rst.in index 24c0dcf2..cbafedad 100644 --- a/docs/source/installing.rst.in +++ b/docs/source/installing.rst.in @@ -86,6 +86,12 @@ helpdesk. ``cmake`` is only needed to build the MessagePack dependency, so if that's already available then you don't need ``cmake```. On a cluster, there is usually a ``cmake`` module to load. +MUSCLE3 can be built with **clang** as well, if you prefer. You'll need to +install it using something like ``sudo apt-get install clang``, and modify the +build command a bit, see below. Note that clang does not have a production-ready +Fortran compiler yet, but the commands below will help you build the C++ part +with clang, and the Fortran part with gfortran. + If your submodels use MPI, then you'll need to compile the MPI support for MUSCLE3. This requires an MPI library to be available. Libmuscle has been tested with OpenMPI on Ubuntu, but should work with other MPI implementations @@ -203,6 +209,20 @@ As an example, to build libmuscle using 2 cores, you would do: This will take a few minutes (including building the dependencies), depending on the speed of your machine. +**Building with clang** + +To build with clang, use + +.. code-block:: bash + + ~/mucle3_source/muscle3-0.5.0$ CXXFLAGS=-fPIE OMPI_CXX=clang++ CXX=clang++ make + + +This will tell the build system to use clang for compiling the C++ code and its +MPI support, but still use gfortran to compile the Fortran code (if gfortran is +installed). The extra ``-fPIE`` switch is needed to make that combination work +on some common platforms. + Getting help ```````````` From ee5fbc1714e6f4cebd1c8e86512b8275f9d5e491 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Mon, 21 Nov 2022 13:22:27 +0100 Subject: [PATCH 086/183] Fix checkpointing bugs --- libmuscle/python/libmuscle/instance.py | 10 ++--- .../python/libmuscle/snapshot_manager.py | 44 ++++++++++++------- .../libmuscle/test/test_snapshot_manager.py | 3 +- 3 files changed, 35 insertions(+), 22 deletions(-) diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py index a7e5ec63..379067c8 100644 --- a/libmuscle/python/libmuscle/instance.py +++ b/libmuscle/python/libmuscle/instance.py @@ -147,6 +147,8 @@ def reuse_instance(self, apply_overlay: bool = True) -> bool: self._snapshot_manager.reuse_instance(snapshot_path) if not do_reuse: + self.__close_ports() + self._communicator.shutdown() self._deregister() self.__manager.close() @@ -568,7 +570,7 @@ def save_final_snapshot(self, message: Message) -> None: (msg.timestamp for msg in self._f_init_cache.values()), default=None) return self._snapshot_manager.save_final_snapshot( - message, f_init_max_timestamp) + message, f_init_max_timestamp, self._do_reuse) def _register(self) -> None: """Register this instance with the manager. @@ -657,8 +659,7 @@ def __check_reuse_instance(self, apply_overlay: bool) -> bool: # TODO: _f_init_cache should be empty here, or the user didn't # receive something that was sent on the last go-around. # At least emit a warning. - if not (self.resuming() and self._first_run): - # when resuming we skip receiving on f_init in the first run + if self.should_init() or not self._first_run: self.__pre_receive_f_init(apply_overlay) self._set_local_log_level() @@ -678,9 +679,6 @@ def __check_reuse_instance(self, apply_overlay: bool) -> bool: do_reuse = False self._first_run = False - if not do_reuse: - self.__close_ports() - self._communicator.shutdown() return do_reuse def __receive_message( diff --git a/libmuscle/python/libmuscle/snapshot_manager.py b/libmuscle/python/libmuscle/snapshot_manager.py index 5f67bfed..9b4c2da3 100644 --- a/libmuscle/python/libmuscle/snapshot_manager.py +++ b/libmuscle/python/libmuscle/snapshot_manager.py @@ -1,9 +1,9 @@ import logging from datetime import datetime from pathlib import Path -from typing import Optional, cast +from typing import Optional -from ymmsl import Checkpoints, Reference +from ymmsl import Checkpoints, Reference, Operator from libmuscle.checkpoint_triggers import TriggerManager from libmuscle.communicator import Communicator, Message @@ -65,8 +65,8 @@ def _set_checkpoint_info(self, """ self._trigger_manager.set_checkpoint_info(utc_reference, checkpoints) if resume is not None: - self.__load_snapshot(resume) - snapshot = cast(Snapshot, self._resume_from_snapshot) + snapshot = self.load_snapshot_from_file(resume) + self._resume_from_snapshot = snapshot self._communicator.restore_message_counts( snapshot.port_message_counts) self._trigger_manager.update_checkpoints( @@ -136,14 +136,16 @@ def save_snapshot(self, msg: Message) -> None: self.__save_snapshot(msg, False) def save_final_snapshot( - self, msg: Message, f_init_max_timestamp: Optional[float]) -> None: + self, msg: Message, f_init_max_timestamp: Optional[float], + do_reuse: Optional[bool]) -> None: """Save final snapshot contained in the message object """ - self.__save_snapshot(msg, True, f_init_max_timestamp) + self.__save_snapshot(msg, True, f_init_max_timestamp, do_reuse) def __save_snapshot( self, msg: Message, final: bool, - f_init_max_timestamp: Optional[float] = None + f_init_max_timestamp: Optional[float] = None, + do_reuse: Optional[bool] = None ) -> None: """Actual implementation used by save_(final_)snapshot. @@ -155,6 +157,18 @@ def __save_snapshot( wallclock_time = self._trigger_manager.elapsed_walltime() port_message_counts = self._communicator.get_message_counts() + if final: + # Decrease F_INIT port counts by one: F_INIT messages are already + # pre-received, but not yet processed by the user code. Therefore, + # the snapshot state should treat these as not-received. + all_ports = self._communicator.list_ports() + ports = all_ports.get(Operator.F_INIT, []) + if self._communicator.settings_in_connected(): + ports.append('muscle_settings_in') + for port_name in ports: + new_counts = [i - 1 for i in port_message_counts[port_name]] + port_message_counts[port_name] = new_counts + snapshot = MsgPackSnapshot( triggers, wallclock_time, port_message_counts, final, msg) @@ -169,7 +183,8 @@ def __save_snapshot( timestamp = f_init_max_timestamp self._trigger_manager.update_checkpoints(timestamp, final) - def __load_snapshot(self, snapshot_location: Path) -> None: + @staticmethod + def load_snapshot_from_file(snapshot_location: Path) -> Snapshot: """Load a previously stored snapshot from the filesystem Args: @@ -186,13 +201,12 @@ def __load_snapshot(self, snapshot_location: Path) -> None: data = snapshot_file.read() if version == MsgPackSnapshot.SNAPSHOT_VERSION_BYTE: - self._resume_from_snapshot = MsgPackSnapshot.from_bytes(data) - else: - raise RuntimeError('Unable to load snapshot from' - f' {snapshot_location}: unknown version of' - ' snapshot file. Was the file saved with a' - ' different version of libmuscle or' - ' tampered with?') + return MsgPackSnapshot.from_bytes(data) + raise RuntimeError('Unable to load snapshot from' + f' {snapshot_location}: unknown version of' + ' snapshot file. Was the file saved with a' + ' different version of libmuscle or' + ' tampered with?') def __store_snapshot(self, snapshot: Snapshot) -> Path: """Store a snapshot on the filesystem diff --git a/libmuscle/python/libmuscle/test/test_snapshot_manager.py b/libmuscle/python/libmuscle/test/test_snapshot_manager.py index 972e409b..7dbce076 100644 --- a/libmuscle/python/libmuscle/test/test_snapshot_manager.py +++ b/libmuscle/python/libmuscle/test/test_snapshot_manager.py @@ -87,7 +87,8 @@ def test_save_load_checkpoint(tmp_path: Path) -> None: assert not snapshot_manager2.should_save_snapshot(0.4) assert snapshot_manager2.should_save_final_snapshot(True, 1.2) - snapshot_manager2.save_final_snapshot(Message(0.6, None, 'test data2'), 1.2) + snapshot_manager2.save_final_snapshot( + Message(0.6, None, 'test data2'), 1.2, True) instance, metadata = manager.submit_snapshot_metadata.call_args[0] assert instance == instance_id From f2b022eb534511883244130e1b748794d1da5c39 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Mon, 21 Nov 2022 13:49:39 +0100 Subject: [PATCH 087/183] Add snapshot type (final/interm.) in resume ymmsl --- .../python/libmuscle/manager/snapshot_registry.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/libmuscle/python/libmuscle/manager/snapshot_registry.py b/libmuscle/python/libmuscle/manager/snapshot_registry.py index 6bdad7fa..d0b75206 100644 --- a/libmuscle/python/libmuscle/manager/snapshot_registry.py +++ b/libmuscle/python/libmuscle/manager/snapshot_registry.py @@ -419,15 +419,17 @@ def _generate_description( component_info.append(( str(node.instance), f'{node.snapshot.timestamp:<11.6g}', - f'{node.snapshot.wallclock_time:<11.6g}')) + f'{node.snapshot.wallclock_time:<11.6g}', + ("Intermediate", "Final")[node.snapshot.is_final_snapshot])) max_instance_len = max(max_instance_len, len(str(node.instance))) instance_with_padding = 'Instance'.ljust(max_instance_len) component_table = [ - f'{instance_with_padding} t wallclock time', - f'{"-" * (max_instance_len + 27)}'] + f'{instance_with_padding} t Wallclock time Type', + f'{"-" * (max_instance_len + 41)}'] component_table += [ f'{name.ljust(max_instance_len)} {timestamp} {walltime}' - for name, timestamp, walltime in component_info] + f' {typ}' + for name, timestamp, walltime, typ in component_info] return (f'Workflow snapshot for {self._model.name}' f' taken on {now.strftime("%Y-%m-%d %H:%M:%S")}.\n' 'Snapshot triggers:\n' + From e881d20c2021361bba7857f18d5cd5747dfae133 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Mon, 21 Nov 2022 13:50:08 +0100 Subject: [PATCH 088/183] Add command line tool to display snapshot info --- muscle3/muscle3.py | 57 ++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 50 insertions(+), 7 deletions(-) diff --git a/muscle3/muscle3.py b/muscle3/muscle3.py index 6682215f..484e4335 100644 --- a/muscle3/muscle3.py +++ b/muscle3/muscle3.py @@ -1,12 +1,16 @@ import sys +from collections import OrderedDict +from pathlib import Path from typing import Sequence import click import ymmsl -from ymmsl import Identifier, PartialConfiguration +from ymmsl import PartialConfiguration -from libmuscle.planner.planner import Planner, Resources +from libmuscle.planner.planner import ( + Planner, Resources, InsufficientResourcesAvailable) +from libmuscle.snapshot_manager import SnapshotManager _RESOURCES_INCOMPLETE_MODEL = """ @@ -17,13 +21,11 @@ @click.group() -def muscle3(): +def muscle3() -> None: """MUSCLE3 command line interface - In the future, this command will provide various functions for - running coupled simulations using MUSCLE3. For now, it does only - one thing, which is to calculate the number of cluster nodes - needed for a given simulation to run without oversubscribing. + This command provides various functions for running coupled simulations + using MUSCLE3. Use muscle3 --help for help with individual commands. """ @@ -107,6 +109,47 @@ def resources( sys.exit(0) +@muscle3.command(short_help='Display details of a stored snapshot') +@click.argument( + 'snapshot_files', nargs=-1, required=True, type=click.Path( + exists=True, file_okay=True, dir_okay=False, readable=True, + allow_dash=True, resolve_path=True, path_type=Path)) +@click.option( + '-d', '--data', is_flag=True, + help='Display stored data. Note this may result in a lot of output!') +@click.option( + '-v', '--verbose', is_flag=True, help='Display more metadata.') +def snapshot( + snapshot_files: Sequence[Path], data: bool, verbose: bool) -> None: + """Display information about stored snapshots. + + Per provided snapshot, display metadata. Stored data can also be output by + supplying the '-d' or '--data' flags. Note that this may result in a lot of + data displayed. + """ + for file in snapshot_files: + snapshot = SnapshotManager.load_snapshot_from_file(file) + click.echo(f'Snapshot at {file}:') + typ = 'Final' if snapshot.is_final_snapshot else 'Intermediate' + properties = OrderedDict([ + ('Snapshot type', typ), + ('Snapshot timestamp', snapshot.message.timestamp), + ('Snapshot wallclock time', snapshot.wallclock_time), + ('Snapshot triggers', snapshot.triggers), + ]) + if verbose: + properties.update([ + ('Internal: Port message counts', snapshot.port_message_counts), + ]) + for prop_name, prop_value in properties.items(): + click.secho(f'{prop_name}: ', nl=False, bold=True) + click.echo(prop_value) + if data: + click.secho('Snapshot data:', bold=True) + click.echo(snapshot.message.data) + click.echo() + + def _load_ymmsl_files(ymmsl_files: Sequence[str]) -> PartialConfiguration: """Loads and merges yMMSL files.""" configuration = PartialConfiguration() From 6b92e5bc67ce4286158182e35b60dae12a5fd577 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Mon, 21 Nov 2022 14:42:43 +0100 Subject: [PATCH 089/183] Set TCP options in Python Set TCP_NODELAY and TCP_QUICKACK (like the C++ code already did) --- libmuscle/python/libmuscle/mcp/tcp_transport_client.py | 4 ++++ libmuscle/python/libmuscle/mcp/tcp_transport_server.py | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/libmuscle/python/libmuscle/mcp/tcp_transport_client.py b/libmuscle/python/libmuscle/mcp/tcp_transport_client.py index cd976cae..88e68510 100644 --- a/libmuscle/python/libmuscle/mcp/tcp_transport_client.py +++ b/libmuscle/python/libmuscle/mcp/tcp_transport_client.py @@ -43,6 +43,10 @@ def __init__(self, location: str) -> None: raise RuntimeError('Could not connect to the server at location' ' {}'.format(location)) else: + if hasattr(socket, "TCP_NODELAY"): + sock.setsockopt(socket.SOL_TCP, socket.TCP_NODELAY, 1) + if hasattr(socket, "TCP_QUICKACK"): + sock.setsockopt(socket.SOL_TCP, socket.TCP_QUICKACK, 1) self._socket = sock def call(self, request: bytes) -> bytes: diff --git a/libmuscle/python/libmuscle/mcp/tcp_transport_server.py b/libmuscle/python/libmuscle/mcp/tcp_transport_server.py index 2219cd76..17831064 100644 --- a/libmuscle/python/libmuscle/mcp/tcp_transport_server.py +++ b/libmuscle/python/libmuscle/mcp/tcp_transport_server.py @@ -1,3 +1,4 @@ +import socket import socketserver as ss import threading from typing import cast, List, Optional, Tuple @@ -19,6 +20,10 @@ def __init__(self, host_port_tuple: Tuple[str, int], ) -> None: super().__init__(host_port_tuple, streamhandler) self.transport_server = transport_server + if hasattr(socket, "TCP_NODELAY"): + self.socket.setsockopt(socket.SOL_TCP, socket.TCP_NODELAY, 1) + if hasattr(socket, "TCP_QUICKACK"): + self.socket.setsockopt(socket.SOL_TCP, socket.TCP_QUICKACK, 1) class TcpHandler(ss.BaseRequestHandler): From 51ed169fcee6fbae30e4048e708da8c0966dc103 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Tue, 22 Nov 2022 16:04:36 +0100 Subject: [PATCH 090/183] Rewrite macro/micro snapshot integration test - Use new paradigm for running actors in integration test - Update checks for changed mechanism --- integration_test/test_snapshot_macro_micro.py | 96 ++++++------------- 1 file changed, 28 insertions(+), 68 deletions(-) diff --git a/integration_test/test_snapshot_macro_micro.py b/integration_test/test_snapshot_macro_micro.py index ae657b5b..0de8fc47 100644 --- a/integration_test/test_snapshot_macro_micro.py +++ b/integration_test/test_snapshot_macro_micro.py @@ -1,10 +1,9 @@ -import sys +from .conftest import run_manager_with_actors import pytest -from ymmsl import Operator, load +from ymmsl import Operator, load, dump from libmuscle import Instance, Message -from libmuscle.manager.manager import Manager from libmuscle.manager.run_dir import RunDir @@ -46,7 +45,7 @@ def macro(): if instance.should_save_snapshot(t_cur): instance.save_snapshot(Message(t_cur, None, i)) - if instance.should_save_final_snapshot(t_cur): + if instance.should_save_final_snapshot(): instance.save_final_snapshot(Message(t_cur, None, i)) @@ -136,42 +135,23 @@ def base_config(): macro.o_i: micro.f_i micro.o_f: macro.s settings: - macro.t0: 0.12 + macro.t0: 0.14 macro.dt: 0.17 macro.t_max: 1.9 micro.dt: 0.009 micro.t_max: 0.1 muscle_remote_log_level: {_LOG_LEVEL} -implementations: - macro_implementation: - executable: {sys.executable} - args: - - {__file__} - - macro - supports_checkpoint: true - micro_implementation: - executable: {sys.executable} - args: - - {__file__} - - micro - supports_checkpoint: true -resources: - macro: - threads: 1 - micro: - threads: 1 checkpoints: + at_end: true simulation_time: - every: 0.4""") -@pytest.mark.skip("To be updated") def test_snapshot_macro_micro(tmp_path, base_config): - base_config.check_consistent() run_dir1 = RunDir(tmp_path / 'run1') - manager = Manager(base_config, run_dir1, _LOG_LEVEL) - manager.start_instances() - assert manager.wait() + run_manager_with_actors( + dump(base_config), run_dir1.path, + python_actors={'macro': macro, 'micro': micro}) # Note: sorted only works because we have fewer than 10 snapshots, otherwise # _10 would be sorted right after _1 @@ -183,74 +163,54 @@ def test_snapshot_macro_micro(tmp_path, base_config): snapshot_docs = list(map(load, snapshots_ymmsl)) assert snapshot_docs[0].resume['macro'] == macro_snapshots[0] assert snapshot_docs[0].resume['micro'] == micro_snapshots[0] - assert snapshot_docs[1].resume['macro'] == macro_snapshots[1] - assert snapshot_docs[1].resume['micro'] == micro_snapshots[0] + assert snapshot_docs[1].resume['macro'] == macro_snapshots[0] + assert snapshot_docs[1].resume['micro'] == micro_snapshots[1] for i in range(2, 7): assert snapshot_docs[i].resume['macro'] == macro_snapshots[i - 1] assert snapshot_docs[i].resume['micro'] == micro_snapshots[i - 1] - base_config.update(snapshot_docs[4]) - del base_config.settings['muscle_snapshot_directory'] - base_config.check_consistent() - run_dir2 = RunDir(tmp_path / 'run2') - manager = Manager(base_config, run_dir2, _LOG_LEVEL) - manager.start_instances() - assert manager.wait() + base_config.update(snapshot_docs[4]) # concatenate resume info + run_manager_with_actors( + dump(base_config), run_dir2.path, + python_actors={'macro': macro, 'micro': micro}) macro_snapshots = sorted(run_dir2.snapshot_dir().glob('macro*')) assert len(macro_snapshots) == 2 # 1.6, final micro_snapshots = sorted(run_dir2.snapshot_dir().glob('micro*')) - assert len(micro_snapshots) == 3 # 1.2, 1.6, final + assert len(micro_snapshots) == 2 # 1.6, final snapshots_ymmsl = sorted(run_dir2.snapshot_dir().glob('snapshot_*.ymmsl')) assert len(snapshots_ymmsl) == 2 -@pytest.mark.skip("To be updated") def test_snapshot_macro_vector_micro(tmp_path, base_config): - macro_implementation = base_config.implementations['macro_implementation'] - macro_implementation.args[-1] = 'macro_vector' base_config.model.components[1].multiplicity = [2] - base_config.check_consistent() run_dir1 = RunDir(tmp_path / 'run1') - manager = Manager(base_config, run_dir1, _LOG_LEVEL) - manager.start_instances() - assert manager.wait() + run_manager_with_actors( + dump(base_config), run_dir1.path, + python_actors={'macro': macro_vector, + 'micro[0]': micro, + 'micro[1]': micro}) macro_snapshots = sorted(run_dir1.snapshot_dir().glob('macro*')) assert len(macro_snapshots) == 6 # 0, 0.4, 0.8, 1.2, 1.6, final micro_snapshots = sorted(run_dir1.snapshot_dir().glob('micro*')) assert len(micro_snapshots) == 6 * 2 # 0, 0.4, 0.8, 1.2, 1.6, final snapshots_ymmsl = sorted(run_dir1.snapshot_dir().glob('snapshot_*.ymmsl')) - # iff micro[0] snapshots before micro[1] at t==0.4, an additional workflow - # snapshot can be created - assert len(snapshots_ymmsl) in (7, 8) - - snapshot_docs = list(map(load, sorted(snapshots_ymmsl))) - base_config.update(snapshot_docs[-3]) - del base_config.settings['muscle_snapshot_directory'] - base_config.check_consistent() + assert len(snapshots_ymmsl) == 8 run_dir2 = RunDir(tmp_path / 'run2') - manager = Manager(base_config, run_dir2, _LOG_LEVEL) - manager.start_instances() - assert manager.wait() + base_config.update(load(snapshots_ymmsl[-3])) # concatenate resume info + run_manager_with_actors( + dump(base_config), run_dir2.path, + python_actors={'macro': macro_vector, + 'micro[0]': micro, + 'micro[1]': micro}) macro_snapshots = sorted(run_dir2.snapshot_dir().glob('macro*')) assert len(macro_snapshots) == 2 # 1.6, final micro_snapshots = sorted(run_dir2.snapshot_dir().glob('micro*')) - assert len(micro_snapshots) == 3 * 2 # 1.2, 1.6, final + assert len(micro_snapshots) == 2 * 2 # 1.6, final snapshots_ymmsl = sorted(run_dir2.snapshot_dir().glob('snapshot_*.ymmsl')) assert len(snapshots_ymmsl) == 2 - - -if __name__ == "__main__": - if 'macro' in sys.argv: - macro() - elif 'macro_vector' in sys.argv: - macro_vector() - elif 'micro' in sys.argv: - micro() - else: - raise RuntimeError('Specify macro or micro on the command line') From 76e7adf9f1edf73624ebc781864f31a4af0b6edc Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Wed, 23 Nov 2022 10:32:28 +0100 Subject: [PATCH 091/183] Add implicit checkpoint for restarting an instance --- integration_test/test_snapshot_macro_micro.py | 30 +++++++++++++------ .../libmuscle/manager/snapshot_registry.py | 18 ++++++++--- 2 files changed, 35 insertions(+), 13 deletions(-) diff --git a/integration_test/test_snapshot_macro_micro.py b/integration_test/test_snapshot_macro_micro.py index 0de8fc47..9f769944 100644 --- a/integration_test/test_snapshot_macro_micro.py +++ b/integration_test/test_snapshot_macro_micro.py @@ -161,16 +161,19 @@ def test_snapshot_macro_micro(tmp_path, base_config): assert len(micro_snapshots) == 6 # 0, 0.4, 0.8, 1.2, 1.6, final snapshots_ymmsl = sorted(run_dir1.snapshot_dir().glob('snapshot_*.ymmsl')) snapshot_docs = list(map(load, snapshots_ymmsl)) - assert snapshot_docs[0].resume['macro'] == macro_snapshots[0] + assert 'macro' not in snapshot_docs[0].resume assert snapshot_docs[0].resume['micro'] == micro_snapshots[0] assert snapshot_docs[1].resume['macro'] == macro_snapshots[0] - assert snapshot_docs[1].resume['micro'] == micro_snapshots[1] - for i in range(2, 7): - assert snapshot_docs[i].resume['macro'] == macro_snapshots[i - 1] - assert snapshot_docs[i].resume['micro'] == micro_snapshots[i - 1] - + assert snapshot_docs[1].resume['micro'] == micro_snapshots[0] + assert snapshot_docs[2].resume['macro'] == macro_snapshots[0] + assert snapshot_docs[2].resume['micro'] == micro_snapshots[1] + for i in range(3, 8): + assert snapshot_docs[i].resume['macro'] == macro_snapshots[i - 2] + assert snapshot_docs[i].resume['micro'] == micro_snapshots[i - 2] + + # resume from the snapshots taken at t>=1.2 run_dir2 = RunDir(tmp_path / 'run2') - base_config.update(snapshot_docs[4]) # concatenate resume info + base_config.update(snapshot_docs[5]) # add resume info run_manager_with_actors( dump(base_config), run_dir2.path, python_actors={'macro': macro, 'micro': micro}) @@ -182,6 +185,15 @@ def test_snapshot_macro_micro(tmp_path, base_config): snapshots_ymmsl = sorted(run_dir2.snapshot_dir().glob('snapshot_*.ymmsl')) assert len(snapshots_ymmsl) == 2 + # resume from the first workflow snapshot (this restarts macro from scratch) + run_dir3 = RunDir(tmp_path / 'run3') + base_config.resume = {} # clear resume information + base_config.update(snapshot_docs[0]) # add resume info + base_config.settings['macro.t_max'] = 0.6 # run shorter + run_manager_with_actors( + dump(base_config), run_dir3.path, + python_actors={'macro': macro, 'micro': micro}) + def test_snapshot_macro_vector_micro(tmp_path, base_config): base_config.model.components[1].multiplicity = [2] @@ -198,10 +210,10 @@ def test_snapshot_macro_vector_micro(tmp_path, base_config): micro_snapshots = sorted(run_dir1.snapshot_dir().glob('micro*')) assert len(micro_snapshots) == 6 * 2 # 0, 0.4, 0.8, 1.2, 1.6, final snapshots_ymmsl = sorted(run_dir1.snapshot_dir().glob('snapshot_*.ymmsl')) - assert len(snapshots_ymmsl) == 8 + assert len(snapshots_ymmsl) == 10 run_dir2 = RunDir(tmp_path / 'run2') - base_config.update(load(snapshots_ymmsl[-3])) # concatenate resume info + base_config.update(load(snapshots_ymmsl[-3])) # add resume info run_manager_with_actors( dump(base_config), run_dir2.path, python_actors={'macro': macro_vector, diff --git a/libmuscle/python/libmuscle/manager/snapshot_registry.py b/libmuscle/python/libmuscle/manager/snapshot_registry.py index d0b75206..cbb8bbde 100644 --- a/libmuscle/python/libmuscle/manager/snapshot_registry.py +++ b/libmuscle/python/libmuscle/manager/snapshot_registry.py @@ -194,7 +194,12 @@ def __init__( self._instances = set() # type: Set[Reference] for component in config.model.components: self._instances.update(component.instances()) - # TODO: create snapshot nodes for starting from scratch + + # Create snapshot nodes for starting from scratch + self._null_snapshot = SnapshotMetadata( + ["Instance start"], 0, 0, None, {}, True, '') + for instance in self._instances: + self.register_snapshot(instance, self._null_snapshot) def register_snapshot( self, instance: Reference, snapshot: SnapshotMetadata) -> None: @@ -243,7 +248,8 @@ def _add_snapshot( peer_snapshot, self._get_connections(instance, peer)) # finally, check if this snapshotnode is now part of a workflow snapshot - self._save_workflow_snapshot(snapshotnode) + if snapshot is not self._null_snapshot: + self._save_workflow_snapshot(snapshotnode) def _save_workflow_snapshot(self, snapshotnode: SnapshotNode) -> None: """Save snapshot if a workflow snapshot exists with the provided node. @@ -402,7 +408,11 @@ def _generate_snapshot_config( selected_snapshots.sort(key=attrgetter('instance')) resume = {} for node in selected_snapshots: - resume[node.instance] = Path(node.snapshot.snapshot_filename) + if node.snapshot is not self._null_snapshot: + # Only store resume information when it is an actual snapshot + # created by the instance. Otherwise the instance can just be + # restarted from the beginning. + resume[node.instance] = Path(node.snapshot.snapshot_filename) description = self._generate_description(selected_snapshots, now) return PartialConfiguration(resume=resume, description=description) @@ -436,7 +446,7 @@ def _generate_description( '\n'.join(f'- {trigger} ({", ".join(triggers[trigger])})' for trigger in sorted(triggers)) + '\n\n' + - '\n'.join(component_table)) + '\n'.join(component_table) + '\n') def _cleanup_snapshots( self, workflow_snapshots: List[List[SnapshotNode]]) -> None: From 775834e27c7a6304a18de4261626b88e6d20d4da Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Thu, 24 Nov 2022 13:46:36 +0100 Subject: [PATCH 092/183] Implicit checkpointing for stateless instances --- integration_test/test_snapshot_macro_micro.py | 55 ++++++++++++- .../python/libmuscle/checkpoint_triggers.py | 7 ++ libmuscle/python/libmuscle/instance.py | 35 ++++++--- libmuscle/python/libmuscle/snapshot.py | 14 ++-- .../python/libmuscle/snapshot_manager.py | 77 +++++++++++++------ .../python/libmuscle/test/test_snapshot.py | 12 +++ .../libmuscle/test/test_snapshot_manager.py | 63 ++++++++++++--- 7 files changed, 212 insertions(+), 51 deletions(-) diff --git a/integration_test/test_snapshot_macro_micro.py b/integration_test/test_snapshot_macro_micro.py index 9f769944..f9c14103 100644 --- a/integration_test/test_snapshot_macro_micro.py +++ b/integration_test/test_snapshot_macro_micro.py @@ -1,7 +1,7 @@ from .conftest import run_manager_with_actors import pytest -from ymmsl import Operator, load, dump +from ymmsl import ImplementationState, Operator, load, dump from libmuscle import Instance, Message from libmuscle.manager.run_dir import RunDir @@ -123,6 +123,28 @@ def micro(): instance.save_final_snapshot(Message(t_cur, None, [i, t_stop])) +def stateless_micro(): + instance = Instance({ + Operator.F_INIT: ['f_i'], + Operator.O_F: ['o_f']}, + stateful=ImplementationState.STATELESS) + + while instance.reuse_instance(): + dt = instance.get_setting('dt', 'float') + t_max = instance.get_setting('t_max', 'float') + + msg = instance.receive('f_i') + t_cur = msg.timestamp + i = msg.data + t_stop = t_cur + t_max + + while t_cur < t_stop: + # faux time-integration for testing snapshots + t_cur += dt + + instance.send('o_f', Message(t_cur, None, i)) + + @pytest.fixture def base_config(): return load(f"""ymmsl_version: v0.1 @@ -195,6 +217,37 @@ def test_snapshot_macro_micro(tmp_path, base_config): python_actors={'macro': macro, 'micro': micro}) +def test_snapshot_macro_stateless_micro(tmp_path, base_config): + run_dir1 = RunDir(tmp_path / 'run1') + run_manager_with_actors( + dump(base_config), run_dir1.path, + python_actors={'macro': macro, 'micro': stateless_micro}) + + # Note: sorted only works because we have fewer than 10 snapshots, otherwise + # _10 would be sorted right after _1 + macro_snapshots = sorted(run_dir1.snapshot_dir().glob('macro*')) + assert len(macro_snapshots) == 6 # 0, 0.4, 0.8, 1.2, 1.6, final + micro_snapshots = sorted(run_dir1.snapshot_dir().glob('micro*')) + assert len(micro_snapshots) == 6 # 0, 0.4, 0.8, 1.2, 1.6, final + snapshots_ymmsl = sorted(run_dir1.snapshot_dir().glob('snapshot_*.ymmsl')) + snapshot_docs = list(map(load, snapshots_ymmsl)) + assert len(snapshot_docs) == 6 + + # resume from the snapshot taken at t>=1.2 + run_dir2 = RunDir(tmp_path / 'run2') + base_config.update(snapshot_docs[3]) # add resume info + run_manager_with_actors( + dump(base_config), run_dir2.path, + python_actors={'macro': macro, 'micro': stateless_micro}) + + macro_snapshots = sorted(run_dir2.snapshot_dir().glob('macro*')) + assert len(macro_snapshots) == 2 # 1.6, final + micro_snapshots = sorted(run_dir2.snapshot_dir().glob('micro*')) + assert len(micro_snapshots) == 3 # 1.2, 1.6, final + snapshots_ymmsl = sorted(run_dir2.snapshot_dir().glob('snapshot_*.ymmsl')) + assert len(snapshots_ymmsl) == 2 + + def test_snapshot_macro_vector_micro(tmp_path, base_config): base_config.model.components[1].multiplicity = [2] diff --git a/libmuscle/python/libmuscle/checkpoint_triggers.py b/libmuscle/python/libmuscle/checkpoint_triggers.py index 57e26ca7..a33a785d 100644 --- a/libmuscle/python/libmuscle/checkpoint_triggers.py +++ b/libmuscle/python/libmuscle/checkpoint_triggers.py @@ -273,6 +273,13 @@ def should_save_final_snapshot( self._should_save_final_called = True return value + @property + def save_final_snapshot_called(self) -> bool: + """Check if :meth:`save_final_snapshot` was called during this + reuse loop. + """ + return self._saved_final_checkpoint + def reuse_instance(self) -> None: """Cleanup between instance reuse """ diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py index 379067c8..f58e146f 100644 --- a/libmuscle/python/libmuscle/instance.py +++ b/libmuscle/python/libmuscle/instance.py @@ -8,7 +8,7 @@ from typing_extensions import Literal from ymmsl import (Identifier, Operator, SettingValue, Port, Reference, - Settings) + Settings, ImplementationState) from libmuscle.communicator import Communicator, Message from libmuscle.settings_manager import SettingsManager @@ -34,7 +34,8 @@ class Instance: This class provides a low-level send/receive API for the instance to use. """ - def __init__(self, ports: Optional[Dict[Operator, List[str]]] = None + def __init__(self, ports: Optional[Dict[Operator, List[str]]] = None, + stateful: ImplementationState = ImplementationState.STATEFUL ) -> None: """Create an Instance. @@ -44,6 +45,14 @@ def __init__(self, ports: Optional[Dict[Operator, List[str]]] = None """ self.__is_shut_down = False + if not isinstance(stateful, ImplementationState): + raise ValueError( + f'Invalid value supplied for "stateful": {stateful}.' + ' Expected one of ImplementationState.STATEFUL,' + ' ImplementationState.STATELESS or ImplementationState.' + 'WEAKLY_STATEFUL.') + self._stateful = stateful + # Note that these are accessed by Muscle3, but otherwise private. self._name, self._index = self.__make_full_name() """Name and index of this instance.""" @@ -68,7 +77,8 @@ def __init__(self, ports: Optional[Dict[Operator, List[str]]] = None """Settings for this instance.""" self._snapshot_manager = SnapshotManager( - self._instance_name(), self.__manager, self._communicator) + self._instance_name(), self.__manager, self._communicator, + self._stateful) """Keeps track of checkpointing and snapshots""" self._first_run = True @@ -144,7 +154,8 @@ def reuse_instance(self, apply_overlay: bool = True) -> bool: snapshot_path = Path(snapshot_dir) except KeyError: snapshot_path = None - self._snapshot_manager.reuse_instance(snapshot_path) + self._snapshot_manager.reuse_instance( + snapshot_path, do_reuse, self.__f_init_max_timestamp) if not do_reuse: self.__close_ports() @@ -539,11 +550,8 @@ def should_save_final_snapshot(self, *, apply_overlay: bool = True) -> bool: 'You may not call should_save_final_snapshot more than once' ' per reuse loop.') self._do_reuse = self.__check_reuse_instance(apply_overlay) - f_init_max_timestamp = max( - (msg.timestamp for msg in self._f_init_cache.values()), - default=None) return self._snapshot_manager.should_save_final_snapshot( - self._do_reuse, f_init_max_timestamp) + self._do_reuse, self.__f_init_max_timestamp) def save_final_snapshot(self, message: Message) -> None: """Save a snapshot before O_F. @@ -566,11 +574,16 @@ def save_final_snapshot(self, message: Message) -> None: attribute can be used to store the internal state of the submodel. """ - f_init_max_timestamp = max( + return self._snapshot_manager.save_final_snapshot( + message, self.__f_init_max_timestamp) + + @property + def __f_init_max_timestamp(self) -> Optional[float]: + """Return max timestamp of pre-received F_INIT messages + """ + return max( (msg.timestamp for msg in self._f_init_cache.values()), default=None) - return self._snapshot_manager.save_final_snapshot( - message, f_init_max_timestamp, self._do_reuse) def _register(self) -> None: """Register this instance with the manager. diff --git a/libmuscle/python/libmuscle/snapshot.py b/libmuscle/python/libmuscle/snapshot.py index 93ed9307..633d3f3d 100644 --- a/libmuscle/python/libmuscle/snapshot.py +++ b/libmuscle/python/libmuscle/snapshot.py @@ -21,7 +21,7 @@ def __init__(self, wallclock_time: float, port_message_counts: Dict[str, List[int]], is_final_snapshot: bool, - message: 'communicator.Message') -> None: + message: Optional['communicator.Message']) -> None: self.triggers = triggers self.wallclock_time = wallclock_time self.port_message_counts = port_message_counts @@ -74,9 +74,11 @@ def to_bytes(self) -> bytes: })) @staticmethod - def message_to_bytes(message: 'communicator.Message') -> bytes: + def message_to_bytes(message: Optional['communicator.Message']) -> bytes: """Use MPPMessage serializer for serializing the message object """ + if message is None: + return b'' settings = Settings() if message.settings is not None: settings = message.settings @@ -85,9 +87,11 @@ def message_to_bytes(message: 'communicator.Message') -> bytes: settings, 0, message.data).encoded() @staticmethod - def bytes_to_message(data: bytes) -> 'communicator.Message': + def bytes_to_message(data: bytes) -> Optional['communicator.Message']: """Use MPPMessage deserializer for serializing the message object """ + if not data: + return None mpp_message = MPPMessage.from_bytes(data) return communicator.Message(mpp_message.timestamp, mpp_message.next_timestamp, @@ -116,8 +120,8 @@ def from_snapshot(snapshot: Snapshot, snapshot_filename: str return SnapshotMetadata( snapshot.triggers, snapshot.wallclock_time, - snapshot.message.timestamp, - snapshot.message.next_timestamp, + snapshot.message.timestamp if snapshot.message else float('NaN'), + snapshot.message.next_timestamp if snapshot.message else None, snapshot.port_message_counts, snapshot.is_final_snapshot, snapshot_filename diff --git a/libmuscle/python/libmuscle/snapshot_manager.py b/libmuscle/python/libmuscle/snapshot_manager.py index 9b4c2da3..54059375 100644 --- a/libmuscle/python/libmuscle/snapshot_manager.py +++ b/libmuscle/python/libmuscle/snapshot_manager.py @@ -1,9 +1,9 @@ import logging from datetime import datetime from pathlib import Path -from typing import Optional +from typing import cast, Optional -from ymmsl import Checkpoints, Reference, Operator +from ymmsl import Checkpoints, Reference, Operator, ImplementationState from libmuscle.checkpoint_triggers import TriggerManager from libmuscle.communicator import Communicator, Message @@ -14,6 +14,11 @@ _MAX_FILE_EXISTS_CHECK = 10000 +# error text for save_snapshot when msg = None +_NO_MESSAGE_PROVIDED = ( + 'Invalid message provided to `{}`. Please create a Message object to' + ' store the state of the instance in a snapshot.') + class SnapshotManager: """Manages information on snapshots for the Instance @@ -25,7 +30,8 @@ class SnapshotManager: def __init__(self, instance_id: Reference, manager: MMPClient, - communicator: Communicator) -> None: + communicator: Communicator, + stateful: ImplementationState) -> None: """Create a new snapshot manager Args: @@ -39,6 +45,7 @@ def __init__(self, self._safe_id = str(instance_id).replace("[", "-").replace("]", "") self._communicator = communicator self._manager = manager + self._stateful = stateful self._first_reuse = True self._trigger_manager = TriggerManager() @@ -66,19 +73,37 @@ def _set_checkpoint_info(self, self._trigger_manager.set_checkpoint_info(utc_reference, checkpoints) if resume is not None: snapshot = self.load_snapshot_from_file(resume) - self._resume_from_snapshot = snapshot + if snapshot.message is not None: + # snapshot.message is None for implicit snapshots + self._resume_from_snapshot = snapshot + self._trigger_manager.update_checkpoints( + snapshot.message.timestamp, + snapshot.is_final_snapshot) self._communicator.restore_message_counts( snapshot.port_message_counts) - self._trigger_manager.update_checkpoints( - snapshot.message.timestamp, - snapshot.is_final_snapshot) - def reuse_instance(self, snapshot_directory: Optional[Path]) -> None: + def reuse_instance(self, snapshot_directory: Optional[Path], + do_reuse: bool, f_init_max_timestamp: Optional[float] + ) -> None: """Callback on Instance.reuse_instance Args: snapshot_directory: Path to store this instance's snapshots in. + do_reuse: Used for implicit snapshots of stateless instances. See + :meth:`should_save_final_snapshot`. + f_init_max_timestamp: Used for implicit snapshots of stateless + instances. See :meth:`should_save_final_snapshot`. """ + # Implicit snapshots for stateless / weakly stateful instances + # Only create implicit snapshot if not already explicitly done + # And not in the first reuse_instance() + if (self._stateful is not ImplementationState.STATEFUL and + not self._trigger_manager.save_final_snapshot_called and + not self._first_reuse): + if self.should_save_final_snapshot(do_reuse, f_init_max_timestamp): + # create an empty message object to store + self.__save_snapshot(None, True, f_init_max_timestamp) + self._trigger_manager.reuse_instance() self._snapshot_directory = snapshot_directory @@ -110,22 +135,22 @@ def should_init(self) -> bool: self._resume_from_snapshot.is_final_snapshot) def load_snapshot(self) -> Message: - """Get the Message to resume from + """Get the Message to resume from. """ if self._resume_from_snapshot is None: raise RuntimeError('No snapshot to load. Use "instance.resuming()"' ' to check if a snapshot is available') - return self._resume_from_snapshot.message + return cast(Message, self._resume_from_snapshot.message) def should_save_snapshot(self, timestamp: float) -> bool: - """See :meth:`TriggerManager.should_save_snapshot` + """See :meth:`TriggerManager.should_save_snapshot`. """ return self._trigger_manager.should_save_snapshot(timestamp) def should_save_final_snapshot( self, do_reuse: bool, f_init_max_timestamp: Optional[float] ) -> bool: - """See :meth:`TriggerManager.should_save_final_snapshot` + """See :meth:`TriggerManager.should_save_final_snapshot`. """ return self._trigger_manager.should_save_final_snapshot( do_reuse, f_init_max_timestamp) @@ -133,25 +158,27 @@ def should_save_final_snapshot( def save_snapshot(self, msg: Message) -> None: """Save snapshot contained in the message object. """ + if not isinstance(msg, Message): + raise ValueError(_NO_MESSAGE_PROVIDED.format('save_snapshot')) self.__save_snapshot(msg, False) def save_final_snapshot( - self, msg: Message, f_init_max_timestamp: Optional[float], - do_reuse: Optional[bool]) -> None: - """Save final snapshot contained in the message object + self, msg: Message, f_init_max_timestamp: Optional[float]) -> None: + """Save final snapshot contained in the message object. """ - self.__save_snapshot(msg, True, f_init_max_timestamp, do_reuse) + if not isinstance(msg, Message): + raise ValueError(_NO_MESSAGE_PROVIDED.format('save_final_snapshot')) + self.__save_snapshot(msg, True, f_init_max_timestamp) def __save_snapshot( - self, msg: Message, final: bool, - f_init_max_timestamp: Optional[float] = None, - do_reuse: Optional[bool] = None + self, msg: Optional[Message], final: bool, + f_init_max_timestamp: Optional[float] = None ) -> None: """Actual implementation used by save_(final_)snapshot. Args: - msg: message object representing the snapshot - final: True iff called from save_final_snapshot + msg: Message object representing the snapshot. + final: True iff called from save_final_snapshot. """ triggers = self._trigger_manager.get_triggers() wallclock_time = self._trigger_manager.elapsed_walltime() @@ -176,10 +203,10 @@ def __save_snapshot( metadata = SnapshotMetadata.from_snapshot(snapshot, str(path)) self._manager.submit_snapshot_metadata(self._instance_id, metadata) - timestamp = msg.timestamp + timestamp = msg.timestamp if msg is not None else -1.0 if final and f_init_max_timestamp is not None: # For final snapshots f_init_max_snapshot is the reference time (see - # should_save_Final_snapshot). + # should_save_final_snapshot). timestamp = f_init_max_timestamp self._trigger_manager.update_checkpoints(timestamp, final) @@ -190,6 +217,7 @@ def load_snapshot_from_file(snapshot_location: Path) -> Snapshot: Args: snapshot_location: path where the snapshot is stored """ + _logger.debug(f'Loading snapshot from {snapshot_location}') if not snapshot_location.is_file(): raise RuntimeError(f'Unable to load snapshot: {snapshot_location}' ' is not a file. Please ensure this path exists' @@ -206,7 +234,7 @@ def load_snapshot_from_file(snapshot_location: Path) -> Snapshot: f' {snapshot_location}: unknown version of' ' snapshot file. Was the file saved with a' ' different version of libmuscle or' - ' tampered with?') + ' edited?') def __store_snapshot(self, snapshot: Snapshot) -> Path: """Store a snapshot on the filesystem @@ -217,6 +245,7 @@ def __store_snapshot(self, snapshot: Snapshot) -> Path: Returns: Path where the snapshot is stored """ + _logger.debug(f'Saving snapshot to {self._snapshot_directory}') if self._snapshot_directory is None: raise RuntimeError('Unknown snapshot directory. Did you try to' ' save a snapshot before entering the reuse' diff --git a/libmuscle/python/libmuscle/test/test_snapshot.py b/libmuscle/python/libmuscle/test/test_snapshot.py index c959a226..f459a001 100644 --- a/libmuscle/python/libmuscle/test/test_snapshot.py +++ b/libmuscle/python/libmuscle/test/test_snapshot.py @@ -61,3 +61,15 @@ def test_message_with_settings() -> None: snapshot2 = MsgPackSnapshot.from_bytes(binary_snapshot) assert snapshot2.message.settings.get('setting') is True + + +def test_implicit_snapshot() -> None: + message = None + snapshot = MsgPackSnapshot([], 0, {}, True, message) + assert snapshot.message is None + + binary_snapshot = snapshot.to_bytes() + assert isinstance(binary_snapshot, bytes) + + snapshot2 = MsgPackSnapshot.from_bytes(binary_snapshot) + assert snapshot2.message is None diff --git a/libmuscle/python/libmuscle/test/test_snapshot_manager.py b/libmuscle/python/libmuscle/test/test_snapshot_manager.py index 7dbce076..16f81ce3 100644 --- a/libmuscle/python/libmuscle/test/test_snapshot_manager.py +++ b/libmuscle/python/libmuscle/test/test_snapshot_manager.py @@ -4,7 +4,8 @@ from unittest.mock import MagicMock import pytest -from ymmsl import Reference, Checkpoints, CheckpointRangeRule +from ymmsl import ( + Reference, Checkpoints, CheckpointRangeRule, ImplementationState) from libmuscle.communicator import Message from libmuscle.snapshot import SnapshotMetadata @@ -16,13 +17,15 @@ def test_no_checkpointing(caplog: pytest.LogCaptureFixture, tmp_path: Path manager = MagicMock() communicator = MagicMock() communicator.get_message_counts.return_value = {} - snapshot_manager = SnapshotManager(Reference('test'), manager, communicator) + snapshot_manager = SnapshotManager( + Reference('test'), manager, communicator, + ImplementationState.STATEFUL) snapshot_manager._set_checkpoint_info( datetime.now(timezone.utc), Checkpoints(), None) assert not snapshot_manager.resuming() - snapshot_manager.reuse_instance(tmp_path) + snapshot_manager.reuse_instance(tmp_path, True, None) assert not snapshot_manager.resuming() assert not snapshot_manager.should_save_snapshot(1) assert not snapshot_manager.should_save_snapshot(5000) @@ -34,21 +37,22 @@ def test_no_checkpointing(caplog: pytest.LogCaptureFixture, tmp_path: Path assert "no checkpoints" in caplog.records[0].message -def test_save_load_checkpoint(tmp_path: Path) -> None: +def test_save_load_snapshot(tmp_path: Path) -> None: manager = MagicMock() communicator = MagicMock() port_message_counts = {'in': [1], 'out': [2], 'muscle_settings_in': [0]} communicator.get_message_counts.return_value = port_message_counts instance_id = Reference('test[1]') - snapshot_manager = SnapshotManager(instance_id, manager, communicator) + snapshot_manager = SnapshotManager( + instance_id, manager, communicator, ImplementationState.STATEFUL) checkpoints = Checkpoints(simulation_time=[CheckpointRangeRule(every=1)]) snapshot_manager._set_checkpoint_info( datetime.now(timezone.utc), checkpoints, None) assert not snapshot_manager.resuming() - snapshot_manager.reuse_instance(tmp_path) + snapshot_manager.reuse_instance(tmp_path, True, None) with pytest.raises(RuntimeError): snapshot_manager.load_snapshot() @@ -71,14 +75,15 @@ def test_save_load_checkpoint(tmp_path: Path) -> None: assert snapshot_path.parent == tmp_path assert snapshot_path.name == 'test-1_1.pack' - snapshot_manager2 = SnapshotManager(instance_id, manager, communicator) + snapshot_manager2 = SnapshotManager( + instance_id, manager, communicator, ImplementationState.STATEFUL) snapshot_manager2._set_checkpoint_info( datetime.now(timezone.utc), checkpoints, snapshot_path) communicator.restore_message_counts.assert_called_with(port_message_counts) assert snapshot_manager2.resuming() - snapshot_manager2.reuse_instance(tmp_path) + snapshot_manager2.reuse_instance(tmp_path, True, None) assert snapshot_manager2.resuming() msg = snapshot_manager2.load_snapshot() assert msg.timestamp == 0.2 @@ -88,7 +93,7 @@ def test_save_load_checkpoint(tmp_path: Path) -> None: assert not snapshot_manager2.should_save_snapshot(0.4) assert snapshot_manager2.should_save_final_snapshot(True, 1.2) snapshot_manager2.save_final_snapshot( - Message(0.6, None, 'test data2'), 1.2, True) + Message(0.6, None, 'test data2'), 1.2) instance, metadata = manager.submit_snapshot_metadata.call_args[0] assert instance == instance_id @@ -104,5 +109,43 @@ def test_save_load_checkpoint(tmp_path: Path) -> None: assert snapshot_path.name == 'test-1_2.pack' assert snapshot_manager2.resuming() - snapshot_manager2.reuse_instance(tmp_path) + snapshot_manager2.reuse_instance(tmp_path, True, None) assert not snapshot_manager2.resuming() + + +def test_save_load_implicit_snapshot(tmp_path: Path) -> None: + manager = MagicMock() + communicator = MagicMock() + port_message_counts = {'in': [1], 'out': [2], 'muscle_settings_in': [0]} + communicator.get_message_counts.return_value = port_message_counts + + instance_id = Reference('test[1]') + snapshot_manager = SnapshotManager( + instance_id, manager, communicator, ImplementationState.STATELESS) + + checkpoints = Checkpoints(simulation_time=[CheckpointRangeRule(every=1)]) + snapshot_manager._set_checkpoint_info( + datetime.now(timezone.utc), checkpoints, None) + + assert not snapshot_manager.resuming() + snapshot_manager.reuse_instance(tmp_path, True, None) + snapshot_manager.reuse_instance(tmp_path, True, 1.5) + manager.submit_snapshot_metadata.assert_called_once() + instance, metadata = manager.submit_snapshot_metadata.call_args[0] + assert instance == instance_id + assert isinstance(metadata, SnapshotMetadata) + snapshot_path = Path(metadata.snapshot_filename) + manager.submit_snapshot_metadata.reset_mock() + + snapshot_manager2 = SnapshotManager( + instance_id, manager, communicator, ImplementationState.STATELESS) + + snapshot_manager2._set_checkpoint_info( + datetime.now(timezone.utc), checkpoints, snapshot_path) + communicator.restore_message_counts.assert_called_with(port_message_counts) + + assert not snapshot_manager2.resuming() + snapshot_manager2.reuse_instance(tmp_path, True, 1.5) + assert not snapshot_manager2.resuming() + snapshot_manager2.reuse_instance(tmp_path, True, 2.5) + manager.submit_snapshot_metadata.assert_called_once() From 2cfcaff780b9abac700a886f550d06ae8422d684 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Thu, 24 Nov 2022 17:01:00 +0100 Subject: [PATCH 093/183] Docs: add intersphinx and update cross-refs --- docs/source/conf.py | 10 +++++ libmuscle/python/libmuscle/grid.py | 24 +++++----- libmuscle/python/libmuscle/instance.py | 62 ++++++++++++++------------ 3 files changed, 57 insertions(+), 39 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 7464d1c0..5cef345e 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -47,6 +47,7 @@ 'breathe', 'sphinx.ext.autodoc', 'sphinx.ext.autosectionlabel', + 'sphinx.ext.intersphinx', 'sphinx.ext.napoleon', 'sphinx.ext.todo', 'sphinx.ext.viewcode', @@ -109,6 +110,15 @@ breathe_default_members = ('members',) +# Configuration of sphinx.ext.intersphinx +# See https://www.sphinx-doc.org/en/master/usage/extensions/intersphinx.html +intersphinx_mapping = { + "python": ("https://docs.python.org/3/", None), + "numpy": ("https://numpy.org/doc/stable", None), + "ymmsl": ("https://ymmsl-python.readthedocs.io/en/stable", None), +} + + # -- Patch version into installation instructions -- def patch_installation_version(): with open('installing.rst', 'w') as out_file: diff --git a/libmuscle/python/libmuscle/grid.py b/libmuscle/python/libmuscle/grid.py index 34efc806..bfe26715 100644 --- a/libmuscle/python/libmuscle/grid.py +++ b/libmuscle/python/libmuscle/grid.py @@ -6,13 +6,13 @@ class Grid: """Represents a grid of data to send or receive. - Note that for received grids, the array of data is a read-only - NumPy array. If you have another array that you want to put the - received data into, use ``np.copyto(dest, source)`` to copy the - contents of the received array across into your destination array. - If you don't have an array yet and want a writable version of the - received array, use ``array.copy()`` to create a writable copy. - See the tutorial for examples. + Note that for received grids, the array of data is a read-only NumPy array. + If you have another array that you want to put the received data into, use + :external:py:func:`np.copyto(dest, source) ` to copy the + contents of the received array across into your destination array. If you + don't have an array yet and want a writable version of the received array, + use :external:py:meth:`array.copy()` to create a + writable copy. See the tutorial for examples. Attributes: array (np.ndarray): An array of data @@ -26,10 +26,12 @@ def __init__( A Grid object represents an multi-dimensional array of data. It has a type, a shape, and optionally a list of index names. - Supported data types are 4- and 8-byte integers (numpy.int32, - numpy.int64), 4- and 8-byte floats (numpy.float32, - numpy.float64), and booleans (np.bool_, np.bool8). The ``data`` - argument must be a NumPy array of one of those types. + Supported data types are 4- and 8-byte integers + (:external:py:attr:`numpy.int32`, :external:py:attr:`numpy.int64`), + 4- and 8-byte floats (:external:py:attr:`numpy.float32`, + :external:py:attr:`numpy.float64`), and booleans + (:external:py:class:`numpy.bool_`, :external:py:attr:`numpy.bool8`). The + ``data`` argument must be a NumPy array of one of those types. If ``indexes`` is given, then it must be a list of strings of the same length as the number of dimensions of ``data``, and diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py index f58e146f..03f3d494 100644 --- a/libmuscle/python/libmuscle/instance.py +++ b/libmuscle/python/libmuscle/instance.py @@ -40,8 +40,12 @@ def __init__(self, ports: Optional[Dict[Operator, List[str]]] = None, """Create an Instance. Args: - ports: A list of port names for each operator of this - component. + ports: A list of port names for each + :external:py:class:`~ymmsl.Operator` of this component. + stateful: Indicate whether this instance carries state between + iterations of the reuse loop. See + :external:py:class:`ymmsl.ImplementationState` for a description + of the options. """ self.__is_shut_down = False @@ -124,7 +128,7 @@ def reuse_instance(self, apply_overlay: bool = True) -> bool: overlay or to save it. If you're going to use :meth:`receive_with_settings` on your F_INIT ports, set this to False. If you don't know what that means, - just call `reuse_instance()` without specifying this + just call :meth:`reuse_instance()` without specifying this and everything will be fine. If it turns out that you did need to specify False, MUSCLE3 will tell you about it in an error message and you can add it still. @@ -236,7 +240,7 @@ def get_setting(self, name: str, typ: Optional[str] = None self._instance_name(), Reference(name), typ) def list_ports(self) -> Dict[Operator, List[str]]: - """Returns a description of the ports that this CE has. + """Returns a description of the ports that this Instance has. Note that the result has almost the same format as the port declarations you pass when making an Instance. The only @@ -244,9 +248,9 @@ def list_ports(self) -> Dict[Operator, List[str]]: even if the port is a vector port. Returns: - A dictionary, indexed by Operator, containing lists of - port names. Operators with no associated ports are not - included. + A dictionary, indexed by :external:py:class:`~ymmsl.Operator`, + containing lists of port names. Operators with no associated ports + are not included. """ return self._communicator.list_ports() @@ -299,7 +303,8 @@ def get_port_length(self, port: str) -> int: Args: port: The name of the port to measure. - Raises: RuntimeError if this is a scalar port. + Raises: + RuntimeError: If this is a scalar port. """ return self._communicator.get_port(port).get_length() @@ -307,7 +312,7 @@ def set_port_length(self, port: str, length: int) -> None: """Resizes the port to the given length. You should check whether the port is resizable using - `is_resizable()` first; whether it is depends on how this + :meth:`is_resizable()` first; whether it is depends on how this component is wired up, so you should check. Args: @@ -324,7 +329,7 @@ def send(self, port_name: str, message: Message, """Send a message to the outside world. Sending is non-blocking, a copy of the message will be made - and stored until the receiver is ready to receive it. + and stored in memory until the receiver is ready to receive it. Args: port_name: The port on which this message is to be sent. @@ -410,8 +415,9 @@ def receive_with_settings( def snapshots_enabled(self) -> bool: """Check if the current workflow has snapshots enabled. - When snapshots are not enabled, all calls to should_save_snapshot and - should_save_final_snapshot will return False. + When snapshots are not enabled, all calls to + :meth:`should_save_snapshot` and :meth:`should_save_final_snapshot` will + return False. Returns: True iff checkpoint rules are defined in the workflow yMMSL. @@ -446,7 +452,7 @@ def should_init(self) -> bool: before attempting to receive data on F_INIT ports. Returns: - True iff the submodel must skip the F_INIT step + True if the submodel must execute the F_INIT step, False otherwise. """ return self._snapshot_manager.should_init() @@ -465,7 +471,8 @@ def load_snapshot(self) -> Message: return self._snapshot_manager.load_snapshot() def should_save_snapshot(self, timestamp: float) -> bool: - """Check if a snapshot should be saved inside a time-integration loop. + """Check if a snapshot should be saved after the S Operator of the + submodel. This method checks if a snapshot should be saved right now, based on the provided timestamp and passed wallclock time. @@ -487,7 +494,7 @@ def should_save_snapshot(self, timestamp: float) -> bool: return self._snapshot_manager.should_save_snapshot(timestamp) def save_snapshot(self, message: Message) -> None: - """Save a snapshot inside a time-integration loop. + """Save a snapshot after the S Operator of the submodel. Before saving a snapshot, you should check using :meth:`should_save_snapshot` if a snapshot should be saved according to @@ -513,14 +520,13 @@ def save_snapshot(self, message: Message) -> None: return self._snapshot_manager.save_snapshot(message) def should_save_final_snapshot(self, *, apply_overlay: bool = True) -> bool: - """Check if a snapshot should be saved before O_F. + """Check if a snapshot should be saved at the end of the reuse loop. - This method checks if a snapshot should be saved right now, based on the - provided timestamp and passed wallclock time. + This method checks if a snapshot should be saved now. When this method returns True, the submodel must also save a snapshot - through :meth:`save_final_snapshot`. A RuntimeError will be generated - when not doing so. + through :meth:`save_final_snapshot`. A :class:`RuntimeError` will be + generated when not doing so. See also :meth:`should_save_snapshot` for the variant that may be called inside of a time-integration loop of the submodel. @@ -528,7 +534,7 @@ def should_save_final_snapshot(self, *, apply_overlay: bool = True) -> bool: .. note:: This method will block until it can determine whether a final snapshot should be taken. This means it must also determine if this - instance is reused. The optional keword-only argument + instance is reused. The optional keyword-only argument `apply_overlay` has the same meaning as for :meth:`reuse_instance`. Args: @@ -536,10 +542,10 @@ def should_save_final_snapshot(self, *, apply_overlay: bool = True) -> bool: overlay or to save it. If you're going to use :meth:`receive_with_settings` on your F_INIT ports, set this to False. If you don't know what that means, just call - `reuse_instance()` without specifying this and everything will - be fine. If it turns out that you did need to specify False, - MUSCLE3 will tell you about it in an error message and you can - add it still. + :meth:`should_save_final_snapshot()` without specifying this and + everything will be fine. If it turns out that you did need to + specify False, MUSCLE3 will tell you about it in an error + message and you can add it still. Returns: True iff a final snapshot should be taken by the submodel according @@ -554,7 +560,7 @@ def should_save_final_snapshot(self, *, apply_overlay: bool = True) -> bool: self._do_reuse, self.__f_init_max_timestamp) def save_final_snapshot(self, message: Message) -> None: - """Save a snapshot before O_F. + """Save a snapshot at the end of the reuse loop. Before saving a snapshot, you should check using :meth:`should_save_final_snapshot` if a snapshot should be saved @@ -566,8 +572,8 @@ def save_final_snapshot(self, message: Message) -> None: submodels of the run (and therefore it is not useful to restart from). It could also lead to a lot of snapshot files clogging your file system. - See also :meth:`save_snapshot` for the variant that may be called inside - of a time-integration loop of the submodel. + See also :meth:`save_snapshot` for the variant that may be called after + each S Operator of the submodel. Args: message: Message object that is saved as snapshot. The data From fea0e74b1305210d1128da60bdccab2d8e25dde7 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Thu, 24 Nov 2022 17:14:54 +0100 Subject: [PATCH 094/183] Update doxyfile - `doxygen -u` - disable html output - ignore tests and bindings source folders --- Doxyfile | 242 +++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 164 insertions(+), 78 deletions(-) diff --git a/Doxyfile b/Doxyfile index f0c512dd..a8b7d7b2 100644 --- a/Doxyfile +++ b/Doxyfile @@ -1,4 +1,4 @@ -# Doxyfile 1.8.13 +# Doxyfile 1.8.17 # This file describes the settings to be used by the documentation system # doxygen (www.doxygen.org) for a project. @@ -17,11 +17,11 @@ # Project related configuration options #--------------------------------------------------------------------------- -# This tag specifies the encoding used for all characters in the config file -# that follow. The default is UTF-8 which is also the encoding used for all text -# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv -# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv -# for the list of possible encodings. +# This tag specifies the encoding used for all characters in the configuration +# file that follow. The default is UTF-8 which is also the encoding used for all +# text before the first occurrence of this tag. Doxygen uses libiconv (or the +# iconv built into libc) for the transcoding. See +# https://www.gnu.org/software/libiconv/ for the list of possible encodings. # The default value is: UTF-8. DOXYFILE_ENCODING = UTF-8 @@ -93,6 +93,14 @@ ALLOW_UNICODE_NAMES = NO OUTPUT_LANGUAGE = English +# The OUTPUT_TEXT_DIRECTION tag is used to specify the direction in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all generated output in the proper direction. +# Possible values are: None, LTR, RTL and Context. +# The default value is: None. + +OUTPUT_TEXT_DIRECTION = None + # If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member # descriptions after the members that are listed in the file and class # documentation (similar to Javadoc). Set to NO to disable this. @@ -189,6 +197,16 @@ SHORT_NAMES = NO JAVADOC_AUTOBRIEF = YES +# If the JAVADOC_BANNER tag is set to YES then doxygen will interpret a line +# such as +# /*************** +# as being the beginning of a Javadoc-style comment "banner". If set to NO, the +# Javadoc-style will behave just like regular comments and it will not be +# interpreted by doxygen. +# The default value is: NO. + +JAVADOC_BANNER = NO + # If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first # line (until the first dot) of a Qt-style comment as the brief description. If # set to NO, the Qt-style will behave just like regular Qt-style comments (thus @@ -236,7 +254,12 @@ TAB_SIZE = 4 # will allow you to put the command \sideeffect (or @sideeffect) in the # documentation, which will result in a user-defined paragraph with heading # "Side Effects:". You can put \n's in the value part of an alias to insert -# newlines. +# newlines (in the resulting output). You can put ^^ in the value part of an +# alias to insert a newline as if a physical newline was in the original file. +# When you need a literal { or } or , in the value part of an alias you have to +# escape them by means of a backslash (\), this can lead to conflicts with the +# commands \{ and \} for these it is advised to use the version @{ and @} or use +# a double escape (\\{ and \\}) ALIASES = @@ -274,17 +297,26 @@ OPTIMIZE_FOR_FORTRAN = NO OPTIMIZE_OUTPUT_VHDL = NO +# Set the OPTIMIZE_OUTPUT_SLICE tag to YES if your project consists of Slice +# sources only. Doxygen will then generate output that is more tailored for that +# language. For instance, namespaces will be presented as modules, types will be +# separated into more groups, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_SLICE = NO + # Doxygen selects the parser to use depending on the extension of the files it # parses. With this tag you can assign which parser to use for a given # extension. Doxygen has a built-in mapping, but you can override or extend it # using this tag. The format is ext=language, where ext is a file extension, and -# language is one of the parsers supported by doxygen: IDL, Java, Javascript, -# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran: -# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran: -# Fortran. In the later case the parser tries to guess whether the code is fixed -# or free formatted code, this is the default for Fortran type files), VHDL. For -# instance to make doxygen treat .inc files as Fortran files (default is PHP), -# and .f files as C (default is Fortran), use: inc=Fortran f=C. +# language is one of the parsers supported by doxygen: IDL, Java, JavaScript, +# Csharp (C#), C, C++, D, PHP, md (Markdown), Objective-C, Python, Slice, +# Fortran (fixed format Fortran: FortranFixed, free formatted Fortran: +# FortranFree, unknown formatted Fortran: Fortran. In the later case the parser +# tries to guess whether the code is fixed or free formatted code, this is the +# default for Fortran type files), VHDL, tcl. For instance to make doxygen treat +# .inc files as Fortran files (default is PHP), and .f files as C (default is +# Fortran), use: inc=Fortran f=C. # # Note: For files without extension you can use no_extension as a placeholder. # @@ -295,7 +327,7 @@ EXTENSION_MAPPING = # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments # according to the Markdown format, which allows for more readable -# documentation. See http://daringfireball.net/projects/markdown/ for details. +# documentation. See https://daringfireball.net/projects/markdown/ for details. # The output of markdown processing is further processed by doxygen, so you can # mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in # case of backward compatibilities issues. @@ -307,7 +339,7 @@ MARKDOWN_SUPPORT = YES # to that level are automatically included in the table of contents, even if # they do not have an id attribute. # Note: This feature currently applies only to Markdown headings. -# Minimum value: 0, maximum value: 99, default value: 0. +# Minimum value: 0, maximum value: 99, default value: 5. # This tag requires that the tag MARKDOWN_SUPPORT is set to YES. TOC_INCLUDE_HEADINGS = 0 @@ -337,7 +369,7 @@ BUILTIN_STL_SUPPORT = NO CPP_CLI_SUPPORT = NO # Set the SIP_SUPPORT tag to YES if your project consists of sip (see: -# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen +# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen # will parse them like normal C++ but will assume all classes use public instead # of private inheritance when no explicit protection keyword is present. # The default value is: NO. @@ -443,6 +475,12 @@ EXTRACT_ALL = NO EXTRACT_PRIVATE = NO +# If the EXTRACT_PRIV_VIRTUAL tag is set to YES, documented private virtual +# methods of a class will be included in the documentation. +# The default value is: NO. + +EXTRACT_PRIV_VIRTUAL = NO + # If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal # scope will be included in the documentation. # The default value is: NO. @@ -497,8 +535,8 @@ HIDE_UNDOC_MEMBERS = NO HIDE_UNDOC_CLASSES = NO # If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend -# (class|struct|union) declarations. If set to NO, these declarations will be -# included in the documentation. +# declarations. If set to NO, these declarations will be included in the +# documentation. # The default value is: NO. HIDE_FRIEND_COMPOUNDS = YES @@ -521,7 +559,7 @@ INTERNAL_DOCS = NO # names in lower-case letters. If set to YES, upper-case letters are also # allowed. This is useful if you have classes or files whose names only differ # in case and if your file system supports case sensitive file names. Windows -# and Mac users are advised to set this option to NO. +# (including Cygwin) ands Mac users are advised to set this option to NO. # The default value is: system dependent. CASE_SENSE_NAMES = YES @@ -708,7 +746,7 @@ LAYOUT_FILE = # The CITE_BIB_FILES tag can be used to specify one or more bib files containing # the reference definitions. This must be a list of .bib files. The .bib # extension is automatically appended if omitted. This requires the bibtex tool -# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info. +# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info. # For LaTeX the style of the bibliography can be controlled using # LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the # search path. See also \cite for info how to create references. @@ -753,7 +791,8 @@ WARN_IF_DOC_ERROR = YES # This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that # are documented, but have no documentation for their parameters or return # value. If set to NO, doxygen will only warn about wrong or incomplete -# parameter documentation, but not about the absence of documentation. +# parameter documentation, but not about the absence of documentation. If +# EXTRACT_ALL is set to YES then this flag will automatically be disabled. # The default value is: NO. WARN_NO_PARAMDOC = NO @@ -795,7 +834,7 @@ INPUT = libmuscle/cpp/src # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses # libiconv (or the iconv built into libc) for the transcoding. See the libiconv -# documentation (see: http://www.gnu.org/software/libiconv) for the list of +# documentation (see: https://www.gnu.org/software/libiconv/) for the list of # possible encodings. # The default value is: UTF-8. @@ -812,8 +851,10 @@ INPUT_ENCODING = UTF-8 # If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, # *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, # *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, -# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, -# *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf and *.qsf. +# *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C comment), +# *.doc (to be provided as doxygen C comment), *.txt (to be provided as doxygen +# C comment), *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f, *.for, *.tcl, *.vhd, +# *.vhdl, *.ucf, *.qsf and *.ice. FILE_PATTERNS = *.c \ *.cc \ @@ -873,7 +914,10 @@ RECURSIVE = YES # Note that relative paths are relative to the directory from which doxygen is # run. -EXCLUDE = libmuscle/cpp/src/muscle_manager_protocol +EXCLUDE = libmuscle/cpp/src/libmuscle/bindings \ + libmuscle/cpp/src/libmuscle/tests \ + libmuscle/cpp/src/ymmsl/bindings \ + libmuscle/cpp/src/ymmsl/tests # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded @@ -1011,7 +1055,7 @@ INLINE_SOURCES = NO STRIP_CODE_COMMENTS = YES # If the REFERENCED_BY_RELATION tag is set to YES then for each documented -# function all documented functions referencing it will be listed. +# entity all documented functions referencing it will be listed. # The default value is: NO. REFERENCED_BY_RELATION = NO @@ -1043,12 +1087,12 @@ SOURCE_TOOLTIPS = YES # If the USE_HTAGS tag is set to YES then the references to source code will # point to the HTML generated by the htags(1) tool instead of doxygen built-in # source browser. The htags tool is part of GNU's global source tagging system -# (see http://www.gnu.org/software/global/global.html). You will need version +# (see https://www.gnu.org/software/global/global.html). You will need version # 4.8.6 or higher. # # To use it do the following: # - Install the latest version of global -# - Enable SOURCE_BROWSER and USE_HTAGS in the config file +# - Enable SOURCE_BROWSER and USE_HTAGS in the configuration file # - Make sure the INPUT points to the root of the source tree # - Run doxygen as normal # @@ -1076,7 +1120,7 @@ VERBATIM_HEADERS = YES # rich C++ code for which doxygen's built-in parser lacks the necessary type # information. # Note: The availability of this option depends on whether or not doxygen was -# generated with the -Duse-libclang=ON option for CMake. +# generated with the -Duse_libclang=ON option for CMake. # The default value is: NO. CLANG_ASSISTED_PARSING = NO @@ -1089,6 +1133,16 @@ CLANG_ASSISTED_PARSING = NO CLANG_OPTIONS = +# If clang assisted parsing is enabled you can provide the clang parser with the +# path to the compilation database (see: +# http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html) used when the files +# were built. This is equivalent to specifying the "-p" option to a clang tool, +# such as clang-check. These options will then be passed to the parser. +# Note: The availability of this option depends on whether or not doxygen was +# generated with the -Duse_libclang=ON option for CMake. + +CLANG_DATABASE_PATH = + #--------------------------------------------------------------------------- # Configuration options related to the alphabetical class index #--------------------------------------------------------------------------- @@ -1122,7 +1176,7 @@ IGNORE_PREFIX = # If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output # The default value is: YES. -GENERATE_HTML = YES +GENERATE_HTML = NO # The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a # relative path is entered the value of OUTPUT_DIRECTORY will be put in front of @@ -1207,7 +1261,7 @@ HTML_EXTRA_FILES = # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen # will adjust the colors in the style sheet and background images according to # this color. Hue is specified as an angle on a colorwheel, see -# http://en.wikipedia.org/wiki/Hue for more information. For instance the value +# https://en.wikipedia.org/wiki/Hue for more information. For instance the value # 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 # purple, and 360 is red again. # Minimum value: 0, maximum value: 359, default value: 220. @@ -1243,6 +1297,17 @@ HTML_COLORSTYLE_GAMMA = 80 HTML_TIMESTAMP = NO +# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML +# documentation will contain a main index with vertical navigation menus that +# are dynamically created via JavaScript. If disabled, the navigation index will +# consists of multiple levels of tabs that are statically embedded in every HTML +# page. Disable this option to support browsers that do not have JavaScript, +# like the Qt help browser. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_DYNAMIC_MENUS = YES + # If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML # documentation will contain sections that can be hidden and shown after the # page has loaded. @@ -1266,13 +1331,13 @@ HTML_INDEX_NUM_ENTRIES = 100 # If the GENERATE_DOCSET tag is set to YES, additional index files will be # generated that can be used as input for Apple's Xcode 3 integrated development -# environment (see: http://developer.apple.com/tools/xcode/), introduced with -# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a +# environment (see: https://developer.apple.com/xcode/), introduced with OSX +# 10.5 (Leopard). To create a documentation set, doxygen will generate a # Makefile in the HTML output directory. Running make will produce the docset in # that directory and running make install will install the docset in # ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at -# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html -# for more information. +# startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy +# genXcode/_index.html for more information. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. @@ -1311,7 +1376,7 @@ DOCSET_PUBLISHER_NAME = Publisher # If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three # additional HTML index files: index.hhp, index.hhc, and index.hhk. The # index.hhp is a project file that can be read by Microsoft's HTML Help Workshop -# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on +# (see: https://www.microsoft.com/en-us/download/details.aspx?id=21138) on # Windows. # # The HTML Help Workshop contains a compiler that can convert all HTML output @@ -1387,7 +1452,7 @@ QCH_FILE = # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help # Project output. For more information please see Qt Help Project / Namespace -# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace). +# (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace). # The default value is: org.doxygen.Project. # This tag requires that the tag GENERATE_QHP is set to YES. @@ -1395,7 +1460,7 @@ QHP_NAMESPACE = org.doxygen.Project # The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt # Help Project output. For more information please see Qt Help Project / Virtual -# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual- +# Folders (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual- # folders). # The default value is: doc. # This tag requires that the tag GENERATE_QHP is set to YES. @@ -1404,7 +1469,7 @@ QHP_VIRTUAL_FOLDER = doc # If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom # filter to add. For more information please see Qt Help Project / Custom -# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- +# Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom- # filters). # This tag requires that the tag GENERATE_QHP is set to YES. @@ -1412,7 +1477,7 @@ QHP_CUST_FILTER_NAME = # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the # custom filter to add. For more information please see Qt Help Project / Custom -# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- +# Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom- # filters). # This tag requires that the tag GENERATE_QHP is set to YES. @@ -1420,7 +1485,7 @@ QHP_CUST_FILTER_ATTRS = # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this # project's filter section matches. Qt Help Project / Filter Attributes (see: -# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#filter-attributes). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_SECT_FILTER_ATTRS = @@ -1513,7 +1578,7 @@ EXT_LINKS_IN_WINDOW = NO FORMULA_FONTSIZE = 10 -# Use the FORMULA_TRANPARENT tag to determine whether or not the images +# Use the FORMULA_TRANSPARENT tag to determine whether or not the images # generated for formulas are transparent PNGs. Transparent PNGs are not # supported properly for IE 6.0, but are supported on all modern browsers. # @@ -1524,8 +1589,14 @@ FORMULA_FONTSIZE = 10 FORMULA_TRANSPARENT = YES +# The FORMULA_MACROFILE can contain LaTeX \newcommand and \renewcommand commands +# to create new LaTeX commands to be used in formulas as building blocks. See +# the section "Including formulas" for details. + +FORMULA_MACROFILE = + # Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see -# http://www.mathjax.org) which uses client side Javascript for the rendering +# https://www.mathjax.org) which uses client side JavaScript for the rendering # instead of using pre-rendered bitmaps. Use this if you do not have LaTeX # installed or if you want to formulas look prettier in the HTML output. When # enabled you may also need to install MathJax separately and configure the path @@ -1552,8 +1623,8 @@ MATHJAX_FORMAT = HTML-CSS # MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax # Content Delivery Network so you can quickly see the result without installing # MathJax. However, it is strongly recommended to install a local copy of -# MathJax from http://www.mathjax.org before deployment. -# The default value is: http://cdn.mathjax.org/mathjax/latest. +# MathJax from https://www.mathjax.org before deployment. +# The default value is: https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/. # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest @@ -1595,7 +1666,7 @@ MATHJAX_CODEFILE = SEARCHENGINE = YES # When the SERVER_BASED_SEARCH tag is enabled the search engine will be -# implemented using a web server instead of a web client using Javascript. There +# implemented using a web server instead of a web client using JavaScript. There # are two flavors of web server based searching depending on the EXTERNAL_SEARCH # setting. When disabled, doxygen will generate a PHP script for searching and # an index file used by the script. When EXTERNAL_SEARCH is enabled the indexing @@ -1614,7 +1685,7 @@ SERVER_BASED_SEARCH = NO # # Doxygen ships with an example indexer (doxyindexer) and search engine # (doxysearch.cgi) which are based on the open source search engine library -# Xapian (see: http://xapian.org/). +# Xapian (see: https://xapian.org/). # # See the section "External Indexing and Searching" for details. # The default value is: NO. @@ -1627,7 +1698,7 @@ EXTERNAL_SEARCH = NO # # Doxygen ships with an example indexer (doxyindexer) and search engine # (doxysearch.cgi) which are based on the open source search engine library -# Xapian (see: http://xapian.org/). See the section "External Indexing and +# Xapian (see: https://xapian.org/). See the section "External Indexing and # Searching" for details. # This tag requires that the tag SEARCHENGINE is set to YES. @@ -1679,21 +1750,35 @@ LATEX_OUTPUT = latex # The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be # invoked. # -# Note that when enabling USE_PDFLATEX this option is only used for generating -# bitmaps for formulas in the HTML output, but not in the Makefile that is -# written to the output directory. -# The default file is: latex. +# Note that when not enabling USE_PDFLATEX the default is latex when enabling +# USE_PDFLATEX the default is pdflatex and when in the later case latex is +# chosen this is overwritten by pdflatex. For specific output languages the +# default can have been set differently, this depends on the implementation of +# the output language. # This tag requires that the tag GENERATE_LATEX is set to YES. LATEX_CMD_NAME = latex # The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate # index for LaTeX. +# Note: This tag is used in the Makefile / make.bat. +# See also: LATEX_MAKEINDEX_CMD for the part in the generated output file +# (.tex). # The default file is: makeindex. # This tag requires that the tag GENERATE_LATEX is set to YES. MAKEINDEX_CMD_NAME = makeindex +# The LATEX_MAKEINDEX_CMD tag can be used to specify the command name to +# generate index for LaTeX. In case there is no backslash (\) as first character +# it will be automatically added in the LaTeX code. +# Note: This tag is used in the generated output file (.tex). +# See also: MAKEINDEX_CMD_NAME for the part in the Makefile / make.bat. +# The default value is: makeindex. +# This tag requires that the tag GENERATE_LATEX is set to YES. + +LATEX_MAKEINDEX_CMD = makeindex + # If the COMPACT_LATEX tag is set to YES, doxygen generates more compact LaTeX # documents. This may be useful for small projects and may help to save some # trees in general. @@ -1814,7 +1899,7 @@ LATEX_SOURCE_CODE = NO # The LATEX_BIB_STYLE tag can be used to specify the style to use for the # bibliography, e.g. plainnat, or ieeetr. See -# http://en.wikipedia.org/wiki/BibTeX and \cite for more info. +# https://en.wikipedia.org/wiki/BibTeX and \cite for more info. # The default value is: plain. # This tag requires that the tag GENERATE_LATEX is set to YES. @@ -1828,6 +1913,14 @@ LATEX_BIB_STYLE = plain LATEX_TIMESTAMP = NO +# The LATEX_EMOJI_DIRECTORY tag is used to specify the (relative or absolute) +# path from which the emoji images will be read. If a relative path is entered, +# it will be relative to the LATEX_OUTPUT directory. If left blank the +# LATEX_OUTPUT directory will be used. +# This tag requires that the tag GENERATE_LATEX is set to YES. + +LATEX_EMOJI_DIRECTORY = + #--------------------------------------------------------------------------- # Configuration options related to the RTF output #--------------------------------------------------------------------------- @@ -1867,9 +1960,9 @@ COMPACT_RTF = NO RTF_HYPERLINKS = NO -# Load stylesheet definitions from file. Syntax is similar to doxygen's config -# file, i.e. a series of assignments. You only have to provide replacements, -# missing definitions are set to their default value. +# Load stylesheet definitions from file. Syntax is similar to doxygen's +# configuration file, i.e. a series of assignments. You only have to provide +# replacements, missing definitions are set to their default value. # # See also section "Doxygen usage" for information on how to generate the # default style sheet that doxygen normally uses. @@ -1878,8 +1971,8 @@ RTF_HYPERLINKS = NO RTF_STYLESHEET_FILE = # Set optional variables used in the generation of an RTF document. Syntax is -# similar to doxygen's config file. A template extensions file can be generated -# using doxygen -e rtf extensionFile. +# similar to doxygen's configuration file. A template extensions file can be +# generated using doxygen -e rtf extensionFile. # This tag requires that the tag GENERATE_RTF is set to YES. RTF_EXTENSIONS_FILE = @@ -1965,6 +2058,13 @@ XML_OUTPUT = xml XML_PROGRAMLISTING = YES +# If the XML_NS_MEMB_FILE_SCOPE tag is set to YES, doxygen will include +# namespace members in file scope as well, matching the HTML output. +# The default value is: NO. +# This tag requires that the tag GENERATE_XML is set to YES. + +XML_NS_MEMB_FILE_SCOPE = NO + #--------------------------------------------------------------------------- # Configuration options related to the DOCBOOK output #--------------------------------------------------------------------------- @@ -1997,9 +2097,9 @@ DOCBOOK_PROGRAMLISTING = NO #--------------------------------------------------------------------------- # If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an -# AutoGen Definitions (see http://autogen.sf.net) file that captures the -# structure of the code including all documentation. Note that this feature is -# still experimental and incomplete at the moment. +# AutoGen Definitions (see http://autogen.sourceforge.net/) file that captures +# the structure of the code including all documentation. Note that this feature +# is still experimental and incomplete at the moment. # The default value is: NO. GENERATE_AUTOGEN_DEF = NO @@ -2099,7 +2199,8 @@ INCLUDE_FILE_PATTERNS = # recursively expanded use the := operator instead of the = operator. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -PREDEFINED = MUSCLE_ENABLE_MPI DOXYGEN_SHOULD_SKIP_THIS +PREDEFINED = MUSCLE_ENABLE_MPI \ + DOXYGEN_SHOULD_SKIP_THIS # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. The @@ -2166,12 +2267,6 @@ EXTERNAL_GROUPS = YES EXTERNAL_PAGES = YES -# The PERL_PATH should be the absolute path and name of the perl script -# interpreter (i.e. the result of 'which perl'). -# The default file (with absolute path) is: /usr/bin/perl. - -PERL_PATH = /usr/bin/perl - #--------------------------------------------------------------------------- # Configuration options related to the dot tool #--------------------------------------------------------------------------- @@ -2185,15 +2280,6 @@ PERL_PATH = /usr/bin/perl CLASS_DIAGRAMS = YES -# You can define message sequence charts within doxygen comments using the \msc -# command. Doxygen will then run the mscgen tool (see: -# http://www.mcternan.me.uk/mscgen/)) to produce the chart and insert it in the -# documentation. The MSCGEN_PATH tag allows you to specify the directory where -# the mscgen tool resides. If left empty the tool is assumed to be found in the -# default search path. - -MSCGEN_PATH = - # You can include diagrams made with dia in doxygen documentation. Doxygen will # then run dia to produce the diagram and insert it in the documentation. The # DIA_PATH tag allows you to specify the directory where the dia binary resides. From d5c56830dcf4ad8972deac4751f28808a565651b Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Fri, 25 Nov 2022 12:58:45 +0100 Subject: [PATCH 095/183] Pin flake8 to <6.0.0 for the time being --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index fcaa0c30..53283fb3 100644 --- a/tox.ini +++ b/tox.ini @@ -5,7 +5,7 @@ skip_missing_interpreters = true [testenv] deps = mypy - flake8 + flake8<6.0.0 pytest pytest-cov git+https://github.com/multiscale/ymmsl-python.git@feature/checkpointing#egg=ymmsl From d6f530fee585499915dea26e16f88719daf304bf Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Fri, 25 Nov 2022 15:40:30 +0100 Subject: [PATCH 096/183] Update consistency check for restart from 0 --- integration_test/test_snapshot_macro_micro.py | 93 +++++++++++++------ .../libmuscle/manager/snapshot_registry.py | 36 ++++--- .../manager/test/test_snapshot_registry.py | 37 +++++--- 3 files changed, 114 insertions(+), 52 deletions(-) diff --git a/integration_test/test_snapshot_macro_micro.py b/integration_test/test_snapshot_macro_micro.py index f9c14103..ae7e8f27 100644 --- a/integration_test/test_snapshot_macro_micro.py +++ b/integration_test/test_snapshot_macro_micro.py @@ -145,6 +145,17 @@ def stateless_micro(): instance.send('o_f', Message(t_cur, None, i)) +def data_transformer(): + instance = Instance({ + Operator.F_INIT: ['f_i'], + Operator.O_F: ['o_f']}, + stateful=ImplementationState.STATELESS) + + while instance.reuse_instance(): + msg = instance.receive('f_i') + instance.send('o_f', msg) + + @pytest.fixture def base_config(): return load(f"""ymmsl_version: v0.1 @@ -169,11 +180,27 @@ def base_config(): - every: 0.4""") +@pytest.fixture +def config_with_transformer(base_config): + base_config.update(load("""ymmsl_version: v0.1 +model: + name: test_snapshot + components: + transformer1: transformer + transformer2: transformer + conduits: + macro.o_i: transformer1.f_i + transformer1.o_f: micro.f_i + micro.o_f: transformer2.f_i + transformer2.o_f: macro.s""")) + return base_config + + def test_snapshot_macro_micro(tmp_path, base_config): + actors = {'macro': macro, 'micro': micro} run_dir1 = RunDir(tmp_path / 'run1') run_manager_with_actors( - dump(base_config), run_dir1.path, - python_actors={'macro': macro, 'micro': micro}) + dump(base_config), run_dir1.path, python_actors=actors) # Note: sorted only works because we have fewer than 10 snapshots, otherwise # _10 would be sorted right after _1 @@ -183,22 +210,20 @@ def test_snapshot_macro_micro(tmp_path, base_config): assert len(micro_snapshots) == 6 # 0, 0.4, 0.8, 1.2, 1.6, final snapshots_ymmsl = sorted(run_dir1.snapshot_dir().glob('snapshot_*.ymmsl')) snapshot_docs = list(map(load, snapshots_ymmsl)) - assert 'macro' not in snapshot_docs[0].resume + assert len(snapshot_docs) == 7 + assert snapshot_docs[0].resume['macro'] == macro_snapshots[0] assert snapshot_docs[0].resume['micro'] == micro_snapshots[0] assert snapshot_docs[1].resume['macro'] == macro_snapshots[0] - assert snapshot_docs[1].resume['micro'] == micro_snapshots[0] - assert snapshot_docs[2].resume['macro'] == macro_snapshots[0] - assert snapshot_docs[2].resume['micro'] == micro_snapshots[1] - for i in range(3, 8): - assert snapshot_docs[i].resume['macro'] == macro_snapshots[i - 2] - assert snapshot_docs[i].resume['micro'] == micro_snapshots[i - 2] + assert snapshot_docs[1].resume['micro'] == micro_snapshots[1] + for i in range(2, 7): + assert snapshot_docs[i].resume['macro'] == macro_snapshots[i - 1] + assert snapshot_docs[i].resume['micro'] == micro_snapshots[i - 1] # resume from the snapshots taken at t>=1.2 run_dir2 = RunDir(tmp_path / 'run2') - base_config.update(snapshot_docs[5]) # add resume info + base_config.update(snapshot_docs[4]) # add resume info run_manager_with_actors( - dump(base_config), run_dir2.path, - python_actors={'macro': macro, 'micro': micro}) + dump(base_config), run_dir2.path, python_actors=actors) macro_snapshots = sorted(run_dir2.snapshot_dir().glob('macro*')) assert len(macro_snapshots) == 2 # 1.6, final @@ -213,15 +238,14 @@ def test_snapshot_macro_micro(tmp_path, base_config): base_config.update(snapshot_docs[0]) # add resume info base_config.settings['macro.t_max'] = 0.6 # run shorter run_manager_with_actors( - dump(base_config), run_dir3.path, - python_actors={'macro': macro, 'micro': micro}) + dump(base_config), run_dir3.path, python_actors=actors) def test_snapshot_macro_stateless_micro(tmp_path, base_config): + actors = {'macro': macro, 'micro': stateless_micro} run_dir1 = RunDir(tmp_path / 'run1') run_manager_with_actors( - dump(base_config), run_dir1.path, - python_actors={'macro': macro, 'micro': stateless_micro}) + dump(base_config), run_dir1.path, python_actors=actors) # Note: sorted only works because we have fewer than 10 snapshots, otherwise # _10 would be sorted right after _1 @@ -237,8 +261,7 @@ def test_snapshot_macro_stateless_micro(tmp_path, base_config): run_dir2 = RunDir(tmp_path / 'run2') base_config.update(snapshot_docs[3]) # add resume info run_manager_with_actors( - dump(base_config), run_dir2.path, - python_actors={'macro': macro, 'micro': stateless_micro}) + dump(base_config), run_dir2.path, python_actors=actors) macro_snapshots = sorted(run_dir2.snapshot_dir().glob('macro*')) assert len(macro_snapshots) == 2 # 1.6, final @@ -250,28 +273,23 @@ def test_snapshot_macro_stateless_micro(tmp_path, base_config): def test_snapshot_macro_vector_micro(tmp_path, base_config): base_config.model.components[1].multiplicity = [2] + actors = {'macro': macro_vector, 'micro[0]': micro, 'micro[1]': micro} run_dir1 = RunDir(tmp_path / 'run1') run_manager_with_actors( - dump(base_config), run_dir1.path, - python_actors={'macro': macro_vector, - 'micro[0]': micro, - 'micro[1]': micro}) + dump(base_config), run_dir1.path, python_actors=actors) macro_snapshots = sorted(run_dir1.snapshot_dir().glob('macro*')) assert len(macro_snapshots) == 6 # 0, 0.4, 0.8, 1.2, 1.6, final micro_snapshots = sorted(run_dir1.snapshot_dir().glob('micro*')) assert len(micro_snapshots) == 6 * 2 # 0, 0.4, 0.8, 1.2, 1.6, final snapshots_ymmsl = sorted(run_dir1.snapshot_dir().glob('snapshot_*.ymmsl')) - assert len(snapshots_ymmsl) == 10 + assert len(snapshots_ymmsl) == 8 run_dir2 = RunDir(tmp_path / 'run2') base_config.update(load(snapshots_ymmsl[-3])) # add resume info run_manager_with_actors( - dump(base_config), run_dir2.path, - python_actors={'macro': macro_vector, - 'micro[0]': micro, - 'micro[1]': micro}) + dump(base_config), run_dir2.path, python_actors=actors) macro_snapshots = sorted(run_dir2.snapshot_dir().glob('macro*')) assert len(macro_snapshots) == 2 # 1.6, final @@ -279,3 +297,24 @@ def test_snapshot_macro_vector_micro(tmp_path, base_config): assert len(micro_snapshots) == 2 * 2 # 1.6, final snapshots_ymmsl = sorted(run_dir2.snapshot_dir().glob('snapshot_*.ymmsl')) assert len(snapshots_ymmsl) == 2 + + +def test_snapshot_macro_transformer_micro(tmp_path, config_with_transformer): + actors = {'macro': macro, 'micro': micro, 'transformer1': data_transformer, + 'transformer2': data_transformer} + + run_dir1 = RunDir(tmp_path / 'run1') + run_manager_with_actors( + dump(config_with_transformer), run_dir1.path, python_actors=actors) + + snapshots_ymmsl = sorted(run_dir1.snapshot_dir().glob('snapshot_*.ymmsl')) + assert len(snapshots_ymmsl) == 8 + + # pick one to resume from + run_dir2 = RunDir(tmp_path / 'run2') + config_with_transformer.update(load(snapshots_ymmsl[4])) # add resume info + run_manager_with_actors( + dump(config_with_transformer), run_dir2.path, python_actors=actors) + + snapshots_ymmsl = sorted(run_dir2.snapshot_dir().glob('snapshot_*.ymmsl')) + assert len(snapshots_ymmsl) == 3 diff --git a/libmuscle/python/libmuscle/manager/snapshot_registry.py b/libmuscle/python/libmuscle/manager/snapshot_registry.py index cbb8bbde..dcf7c3e8 100644 --- a/libmuscle/python/libmuscle/manager/snapshot_registry.py +++ b/libmuscle/python/libmuscle/manager/snapshot_registry.py @@ -24,6 +24,9 @@ _QueueItemType = Optional[Tuple[Reference, SnapshotMetadata]] _T = TypeVar("_T") +# this snapshot is used as a placeholder for restarting from scratch +_NULL_SNAPSHOT = SnapshotMetadata(["Instance start"], 0, 0, None, {}, True, '') + def safe_get(lst: List[_T], index: int, default: _T) -> _T: """Get an item from the list, returning default when it does not exist. @@ -45,40 +48,48 @@ class _ConnectionInfo(Flag): PEER_IS_VECTOR = auto() -def calc_consistency(num1: int, num2: int, first_is_sent: bool) -> bool: +def calc_consistency( + num1: int, num2: int, first_is_sent: bool, num2_is_restart: bool + ) -> bool: """Calculate consistency of message counts. Args: num1: message count of instance 1 num2: message count of instance 2 first_is_sent: True iff instance 1 is sending messages over this conduit + num2_is_restart: True iff the snapshot of num2 is a full restart Returns: True iff the two message counts are consistent """ return (num1 == num2 or # strong num1 + 1 == num2 and first_is_sent or # weak (1 = sent) - num2 + 1 == num1 and not first_is_sent) # weak (2 = sent) + # weak (2 = sent) - only allow if num2 is not a restart + num2 + 1 == num1 and not first_is_sent and not num2_is_restart) def calc_consistency_list( - num1: List[int], num2: List[int], first_is_sent: bool) -> bool: + num1: List[int], num2: List[int], first_is_sent: bool, + num2_is_restart: bool) -> bool: """Calculate consistency of message counts. Args: num1: message count of instance 1 num2: message count of instance 2 first_is_sent: True iff instance 1 is sending messages over this conduit + num2_is_restart: True iff the snapshot of num2 is a full restart Returns: True iff the two message counts are consistent """ if first_is_sent: + allow_weak = True slot_iter = zip_longest(num1, num2, fillvalue=0) else: + allow_weak = not num2_is_restart slot_iter = zip_longest(num2, num1, fillvalue=0) - return all(slot_sent == slot_received or # strong - slot_sent + 1 == slot_received # weak + return all(slot_sent == slot_received or # strong + slot_sent + 1 == slot_received and allow_weak # weak for slot_sent, slot_received in slot_iter) @@ -129,6 +140,7 @@ def do_consistency_check( """ i_snapshot = self.snapshot p_snapshot = peer_node.snapshot + peer_is_restart = p_snapshot is _NULL_SNAPSHOT for connection in connections: i_port, p_port, conn = connection is_sending = bool(conn & _ConnectionInfo.SELF_IS_SENDING) @@ -139,16 +151,16 @@ def do_consistency_check( consistent = calc_consistency( safe_get(i_msg_counts, slot, 0), safe_get(p_msg_counts, 0, 0), - is_sending) + is_sending, peer_is_restart) elif conn & _ConnectionInfo.PEER_IS_VECTOR: slot = int(self.instance[-1]) consistent = calc_consistency( safe_get(i_msg_counts, 0, 0), safe_get(p_msg_counts, slot, 0), - is_sending) + is_sending, peer_is_restart) else: consistent = calc_consistency_list( - i_msg_counts, p_msg_counts, is_sending) + i_msg_counts, p_msg_counts, is_sending, peer_is_restart) if not consistent: # not consistent return False self.consistent_peers.setdefault( @@ -196,10 +208,8 @@ def __init__( self._instances.update(component.instances()) # Create snapshot nodes for starting from scratch - self._null_snapshot = SnapshotMetadata( - ["Instance start"], 0, 0, None, {}, True, '') for instance in self._instances: - self.register_snapshot(instance, self._null_snapshot) + self.register_snapshot(instance, _NULL_SNAPSHOT) def register_snapshot( self, instance: Reference, snapshot: SnapshotMetadata) -> None: @@ -248,7 +258,7 @@ def _add_snapshot( peer_snapshot, self._get_connections(instance, peer)) # finally, check if this snapshotnode is now part of a workflow snapshot - if snapshot is not self._null_snapshot: + if snapshot is not _NULL_SNAPSHOT: self._save_workflow_snapshot(snapshotnode) def _save_workflow_snapshot(self, snapshotnode: SnapshotNode) -> None: @@ -408,7 +418,7 @@ def _generate_snapshot_config( selected_snapshots.sort(key=attrgetter('instance')) resume = {} for node in selected_snapshots: - if node.snapshot is not self._null_snapshot: + if node.snapshot is not _NULL_SNAPSHOT: # Only store resume information when it is an actual snapshot # created by the instance. Otherwise the instance can just be # restarted from the beginning. diff --git a/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py b/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py index dd6c0c46..6b9838e6 100644 --- a/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py +++ b/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py @@ -8,8 +8,8 @@ ImplementationState as IState, Reference) from libmuscle.manager.snapshot_registry import ( - SnapshotNode, SnapshotRegistry, calc_consistency, calc_consistency_list, safe_get, - _ConnectionInfo) + SnapshotNode, SnapshotRegistry, calc_consistency, calc_consistency_list, + safe_get, _ConnectionInfo) from libmuscle.manager.topology_store import TopologyStore from libmuscle.snapshot import SnapshotMetadata @@ -78,26 +78,39 @@ def test_safe_get() -> None: def test_calc_consistency() -> None: num_sent = 3 for num_received in [2, 3, 4, 5]: - consistent = num_received in [3, 4] - assert calc_consistency(num_sent, num_received, True) is consistent - assert calc_consistency(num_received, num_sent, False) is consistent + expect = num_received in [3, 4] + assert calc_consistency(num_sent, num_received, True, False) is expect + assert calc_consistency(num_received, num_sent, False, False) is expect num_received = 10 for num_sent in [8, 9, 10, 11]: - consistent = num_sent in [9, 10] - assert calc_consistency(num_sent, num_received, True) is consistent - assert calc_consistency(num_received, num_sent, False) is consistent + expect = num_sent in [9, 10] + assert calc_consistency(num_sent, num_received, True, False) is expect + assert calc_consistency(num_received, num_sent, False, False) is expect + + +def test_calc_consistency_with_restart() -> None: + # Check normal rules + assert calc_consistency(0, 0, True, True) + assert calc_consistency(0, 0, False, True) + assert not calc_consistency(1, 0, True, True) + assert not calc_consistency(1, 0, True, False) + assert calc_consistency(1, 0, False, False) + # Different: num2 == 0 comes from the restarted actor, we do not want a + # resume file to be created in this instance (because an instance further in + # the call chain is ahead of the one that would be restarted): + assert not calc_consistency(1, 0, False, True) def test_calc_consistency_list() -> None: num_sent = [3, 3] for num_received in [[2, 3], [3, 2], [3, 5], [], [4, 4, 0, 0, 2]]: - assert not calc_consistency_list(num_sent, num_received, True) - assert not calc_consistency_list(num_received, num_sent, False) + assert not calc_consistency_list(num_sent, num_received, True, False) + assert not calc_consistency_list(num_received, num_sent, False, False) for num_received in [[3, 3], [3, 4], [4, 3], [4, 4], [3, 3, 1], [4, 4, 0, 0, 0, 1, 0, 1]]: - assert calc_consistency_list(num_sent, num_received, True) - assert calc_consistency_list(num_received, num_sent, False) + assert calc_consistency_list(num_sent, num_received, True, False) + assert calc_consistency_list(num_received, num_sent, False, False) def test_write_ymmsl(tmp_path: Path): From 4d2cfc2ba12fd0a4a26c4ca66766c7b0cfcaca5a Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Mon, 28 Nov 2022 10:53:28 +0100 Subject: [PATCH 097/183] Update snapshot_directory: - When a run_dir is provided, each instance has a unique snapshot folder - Snapshot_directory is fixed for a whole run and provided to instances in get_checkpoint_info() instead of the settings - Fallback snapshot directory is the cwd() of the instance (warning generated in muscle manager) --- integration_test/test_snapshot_macro_micro.py | 48 ++++++++++--------- libmuscle/python/libmuscle/instance.py | 11 +---- libmuscle/python/libmuscle/manager/manager.py | 7 +-- .../python/libmuscle/manager/mmp_server.py | 23 +++++++-- libmuscle/python/libmuscle/manager/run_dir.py | 2 +- .../python/libmuscle/manager/test/conftest.py | 10 ++-- .../manager/test/test_mmp_request_handler.py | 19 +++++++- libmuscle/python/libmuscle/mmp_client.py | 17 +++++-- .../python/libmuscle/snapshot_manager.py | 13 ++--- .../python/libmuscle/test/test_instance.py | 15 +++--- .../libmuscle/test/test_snapshot_manager.py | 26 +++++----- 11 files changed, 108 insertions(+), 83 deletions(-) diff --git a/integration_test/test_snapshot_macro_micro.py b/integration_test/test_snapshot_macro_micro.py index ae7e8f27..0420df98 100644 --- a/integration_test/test_snapshot_macro_micro.py +++ b/integration_test/test_snapshot_macro_micro.py @@ -204,11 +204,11 @@ def test_snapshot_macro_micro(tmp_path, base_config): # Note: sorted only works because we have fewer than 10 snapshots, otherwise # _10 would be sorted right after _1 - macro_snapshots = sorted(run_dir1.snapshot_dir().glob('macro*')) + macro_snapshots = sorted(run_dir1.snapshot_dir('macro').iterdir()) assert len(macro_snapshots) == 6 # 0, 0.4, 0.8, 1.2, 1.6, final - micro_snapshots = sorted(run_dir1.snapshot_dir().glob('micro*')) + micro_snapshots = sorted(run_dir1.snapshot_dir('micro').iterdir()) assert len(micro_snapshots) == 6 # 0, 0.4, 0.8, 1.2, 1.6, final - snapshots_ymmsl = sorted(run_dir1.snapshot_dir().glob('snapshot_*.ymmsl')) + snapshots_ymmsl = sorted(run_dir1.snapshot_dir().iterdir()) snapshot_docs = list(map(load, snapshots_ymmsl)) assert len(snapshot_docs) == 7 assert snapshot_docs[0].resume['macro'] == macro_snapshots[0] @@ -225,11 +225,11 @@ def test_snapshot_macro_micro(tmp_path, base_config): run_manager_with_actors( dump(base_config), run_dir2.path, python_actors=actors) - macro_snapshots = sorted(run_dir2.snapshot_dir().glob('macro*')) + macro_snapshots = sorted(run_dir2.snapshot_dir('macro').iterdir()) assert len(macro_snapshots) == 2 # 1.6, final - micro_snapshots = sorted(run_dir2.snapshot_dir().glob('micro*')) + micro_snapshots = sorted(run_dir2.snapshot_dir('micro').iterdir()) assert len(micro_snapshots) == 2 # 1.6, final - snapshots_ymmsl = sorted(run_dir2.snapshot_dir().glob('snapshot_*.ymmsl')) + snapshots_ymmsl = sorted(run_dir2.snapshot_dir().iterdir()) assert len(snapshots_ymmsl) == 2 # resume from the first workflow snapshot (this restarts macro from scratch) @@ -249,11 +249,11 @@ def test_snapshot_macro_stateless_micro(tmp_path, base_config): # Note: sorted only works because we have fewer than 10 snapshots, otherwise # _10 would be sorted right after _1 - macro_snapshots = sorted(run_dir1.snapshot_dir().glob('macro*')) + macro_snapshots = sorted(run_dir1.snapshot_dir('macro').iterdir()) assert len(macro_snapshots) == 6 # 0, 0.4, 0.8, 1.2, 1.6, final - micro_snapshots = sorted(run_dir1.snapshot_dir().glob('micro*')) + micro_snapshots = sorted(run_dir1.snapshot_dir('micro').iterdir()) assert len(micro_snapshots) == 6 # 0, 0.4, 0.8, 1.2, 1.6, final - snapshots_ymmsl = sorted(run_dir1.snapshot_dir().glob('snapshot_*.ymmsl')) + snapshots_ymmsl = sorted(run_dir1.snapshot_dir().iterdir()) snapshot_docs = list(map(load, snapshots_ymmsl)) assert len(snapshot_docs) == 6 @@ -263,11 +263,11 @@ def test_snapshot_macro_stateless_micro(tmp_path, base_config): run_manager_with_actors( dump(base_config), run_dir2.path, python_actors=actors) - macro_snapshots = sorted(run_dir2.snapshot_dir().glob('macro*')) + macro_snapshots = sorted(run_dir2.snapshot_dir('macro').iterdir()) assert len(macro_snapshots) == 2 # 1.6, final - micro_snapshots = sorted(run_dir2.snapshot_dir().glob('micro*')) + micro_snapshots = sorted(run_dir2.snapshot_dir('micro').iterdir()) assert len(micro_snapshots) == 3 # 1.2, 1.6, final - snapshots_ymmsl = sorted(run_dir2.snapshot_dir().glob('snapshot_*.ymmsl')) + snapshots_ymmsl = sorted(run_dir2.snapshot_dir().iterdir()) assert len(snapshots_ymmsl) == 2 @@ -279,11 +279,13 @@ def test_snapshot_macro_vector_micro(tmp_path, base_config): run_manager_with_actors( dump(base_config), run_dir1.path, python_actors=actors) - macro_snapshots = sorted(run_dir1.snapshot_dir().glob('macro*')) + macro_snapshots = sorted(run_dir1.snapshot_dir('macro').iterdir()) assert len(macro_snapshots) == 6 # 0, 0.4, 0.8, 1.2, 1.6, final - micro_snapshots = sorted(run_dir1.snapshot_dir().glob('micro*')) - assert len(micro_snapshots) == 6 * 2 # 0, 0.4, 0.8, 1.2, 1.6, final - snapshots_ymmsl = sorted(run_dir1.snapshot_dir().glob('snapshot_*.ymmsl')) + micro_snapshots = sorted(run_dir1.snapshot_dir('micro[0]').iterdir()) + assert len(micro_snapshots) == 6 # 0, 0.4, 0.8, 1.2, 1.6, final + micro_snapshots = sorted(run_dir1.snapshot_dir('micro[1]').iterdir()) + assert len(micro_snapshots) == 6 # 0, 0.4, 0.8, 1.2, 1.6, final + snapshots_ymmsl = sorted(run_dir1.snapshot_dir().iterdir()) assert len(snapshots_ymmsl) == 8 run_dir2 = RunDir(tmp_path / 'run2') @@ -291,11 +293,13 @@ def test_snapshot_macro_vector_micro(tmp_path, base_config): run_manager_with_actors( dump(base_config), run_dir2.path, python_actors=actors) - macro_snapshots = sorted(run_dir2.snapshot_dir().glob('macro*')) + macro_snapshots = sorted(run_dir2.snapshot_dir('macro').iterdir()) assert len(macro_snapshots) == 2 # 1.6, final - micro_snapshots = sorted(run_dir2.snapshot_dir().glob('micro*')) - assert len(micro_snapshots) == 2 * 2 # 1.6, final - snapshots_ymmsl = sorted(run_dir2.snapshot_dir().glob('snapshot_*.ymmsl')) + micro_snapshots = sorted(run_dir2.snapshot_dir('micro[0]').iterdir()) + assert len(micro_snapshots) == 2 # 1.6, final + micro_snapshots = sorted(run_dir2.snapshot_dir('micro[1]').iterdir()) + assert len(micro_snapshots) == 2 # 1.6, final + snapshots_ymmsl = sorted(run_dir2.snapshot_dir().iterdir()) assert len(snapshots_ymmsl) == 2 @@ -307,7 +311,7 @@ def test_snapshot_macro_transformer_micro(tmp_path, config_with_transformer): run_manager_with_actors( dump(config_with_transformer), run_dir1.path, python_actors=actors) - snapshots_ymmsl = sorted(run_dir1.snapshot_dir().glob('snapshot_*.ymmsl')) + snapshots_ymmsl = sorted(run_dir1.snapshot_dir().iterdir()) assert len(snapshots_ymmsl) == 8 # pick one to resume from @@ -316,5 +320,5 @@ def test_snapshot_macro_transformer_micro(tmp_path, config_with_transformer): run_manager_with_actors( dump(config_with_transformer), run_dir2.path, python_actors=actors) - snapshots_ymmsl = sorted(run_dir2.snapshot_dir().glob('snapshot_*.ymmsl')) + snapshots_ymmsl = sorted(run_dir2.snapshot_dir().iterdir()) assert len(snapshots_ymmsl) == 3 diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py index 03f3d494..48bb2905 100644 --- a/libmuscle/python/libmuscle/instance.py +++ b/libmuscle/python/libmuscle/instance.py @@ -1,7 +1,6 @@ from copy import copy import logging import os -from pathlib import Path import sys from typing import cast, Dict, List, Optional, Tuple, overload # TODO: import from typing module when dropping support for python 3.7 @@ -150,16 +149,8 @@ def reuse_instance(self, apply_overlay: bool = True) -> bool: do_reuse = self.__check_reuse_instance(apply_overlay) self._do_reuse = None - # Note: muscle_snapshot_directory setting is provided by muscle_manager - # when checkpointing is enabled for this run. When checkpointing is not - # enabled, it might not exist and a KeyError is raised. - try: - snapshot_dir = self.get_setting('muscle_snapshot_directory', 'str') - snapshot_path = Path(snapshot_dir) - except KeyError: - snapshot_path = None self._snapshot_manager.reuse_instance( - snapshot_path, do_reuse, self.__f_init_max_timestamp) + do_reuse, self.__f_init_max_timestamp) if not do_reuse: self.__close_ports() diff --git a/libmuscle/python/libmuscle/manager/manager.py b/libmuscle/python/libmuscle/manager/manager.py index d96842a7..0ec1da3b 100644 --- a/libmuscle/python/libmuscle/manager/manager.py +++ b/libmuscle/python/libmuscle/manager/manager.py @@ -57,11 +57,6 @@ def __init__( self._configuration, self._run_dir.path / 'configuration.ymmsl') - # TODO: decide if this should be a setting or part of checkpoint_info - # TODO: separate folder per intance - self._configuration.settings.setdefault( - 'muscle_snapshot_directory', str(snapshot_dir)) - self._instance_manager = None # type: Optional[InstanceManager] try: configuration = self._configuration.as_configuration() @@ -80,7 +75,7 @@ def __init__( self._server = MMPServer( self._logger, self._configuration, self._instance_registry, self._topology_store, - self._snapshot_registry) + self._snapshot_registry, run_dir) if self._instance_manager: self._instance_manager.set_manager_location( diff --git a/libmuscle/python/libmuscle/manager/mmp_server.py b/libmuscle/python/libmuscle/manager/mmp_server.py index 9382d0eb..90617fae 100644 --- a/libmuscle/python/libmuscle/manager/mmp_server.py +++ b/libmuscle/python/libmuscle/manager/mmp_server.py @@ -1,7 +1,7 @@ from datetime import datetime, timezone import errno import logging -from typing import Any, Dict, cast, List +from typing import Any, Dict, cast, List, Optional import msgpack from ymmsl import ( @@ -12,6 +12,7 @@ from libmuscle.manager.instance_registry import ( AlreadyRegistered, InstanceRegistry) from libmuscle.manager.logger import Logger +from libmuscle.manager.run_dir import RunDir from libmuscle.manager.snapshot_registry import SnapshotRegistry from libmuscle.manager.topology_store import TopologyStore from libmuscle.mcp.protocol import RequestType, ResponseType @@ -56,7 +57,9 @@ def __init__( configuration: PartialConfiguration, instance_registry: InstanceRegistry, topology_store: TopologyStore, - snapshot_registry: SnapshotRegistry): + snapshot_registry: SnapshotRegistry, + run_dir: Optional[RunDir] + ) -> None: """Create an MMPRequestHandler. Args: @@ -70,6 +73,7 @@ def __init__( self._instance_registry = instance_registry self._topology_store = topology_store self._snapshot_registry = snapshot_registry + self._run_dir = run_dir self._reference_time = datetime.now(timezone.utc) self._reference_timestamp = self._reference_time.timestamp() @@ -286,15 +290,23 @@ def _get_checkpoint_info(self, instance_id: str) -> Any: wallclock time of the start of the workflow. checkpoints (dict): Dictionary encdoing a ymmsl.Checkpoints object. resume_path (Optional[str]): Checkpoint filename to resume from. + snapshot_directory (Optional[str]): Directory to store instance + snapshots. """ instance = Reference(instance_id) resume = None if instance in self._configuration.resume: resume = str(self._configuration.resume[instance]) + + snapshot_directory = None + if self._run_dir is not None: + snapshot_directory = str(self._run_dir.snapshot_dir(instance)) + return [ResponseType.SUCCESS.value, self._reference_timestamp, encode_checkpoints(self._configuration.checkpoints), - resume] + resume, + snapshot_directory] class MMPServer: @@ -310,7 +322,8 @@ def __init__( configuration: PartialConfiguration, instance_registry: InstanceRegistry, topology_store: TopologyStore, - snapshot_registry: SnapshotRegistry + snapshot_registry: SnapshotRegistry, + run_dir: Optional[RunDir] ) -> None: """Create an MMPServer. @@ -329,7 +342,7 @@ def __init__( """ self._handler = MMPRequestHandler( logger, configuration, instance_registry, topology_store, - snapshot_registry) + snapshot_registry, run_dir) try: self._server = TcpTransportServer(self._handler, 9000) except OSError as e: diff --git a/libmuscle/python/libmuscle/manager/run_dir.py b/libmuscle/python/libmuscle/manager/run_dir.py index 6a50c2fe..186d32e8 100644 --- a/libmuscle/python/libmuscle/manager/run_dir.py +++ b/libmuscle/python/libmuscle/manager/run_dir.py @@ -75,5 +75,5 @@ def snapshot_dir(self, name: Optional[Reference] = None) -> Path: path = self.path / 'snapshots' else: path = self.instance_dir(name) / 'snapshots' - path.mkdir(exist_ok=True) + path.mkdir(parents=True, exist_ok=True) return path diff --git a/libmuscle/python/libmuscle/manager/test/conftest.py b/libmuscle/python/libmuscle/manager/test/conftest.py index 24772bda..992a3950 100644 --- a/libmuscle/python/libmuscle/manager/test/conftest.py +++ b/libmuscle/python/libmuscle/manager/test/conftest.py @@ -6,6 +6,7 @@ from libmuscle.manager.instance_registry import InstanceRegistry from libmuscle.manager.logger import Logger from libmuscle.manager.mmp_server import MMPRequestHandler +from libmuscle.manager.run_dir import RunDir from libmuscle.manager.snapshot_registry import SnapshotRegistry from libmuscle.manager.topology_store import TopologyStore @@ -53,7 +54,7 @@ def mmp_request_handler( snapshot_registry): return MMPRequestHandler( logger, mmp_configuration, instance_registry, topology_store, - snapshot_registry) + snapshot_registry, None) @pytest.fixture @@ -73,7 +74,7 @@ def registered_mmp_request_handler( snapshot_registry): return MMPRequestHandler( logger, mmp_configuration, loaded_instance_registry, topology_store, - snapshot_registry) + snapshot_registry, None) @pytest.fixture @@ -126,7 +127,8 @@ def loaded_instance_registry2(): @pytest.fixture def registered_mmp_request_handler2( logger, mmp_configuration, loaded_instance_registry2, topology_store2, - snapshot_registry2): + snapshot_registry2, tmp_path): return MMPRequestHandler( logger, mmp_configuration, - loaded_instance_registry2, topology_store2, snapshot_registry2) + loaded_instance_registry2, topology_store2, snapshot_registry2, + RunDir(tmp_path)) diff --git a/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py b/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py index 89de4068..876ae197 100644 --- a/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py +++ b/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py @@ -17,7 +17,7 @@ def test_create_servicer(logger, mmp_configuration, instance_registry, topology_store, snapshot_registry): MMPRequestHandler( logger, mmp_configuration, instance_registry, topology_store, - snapshot_registry) + snapshot_registry, None) def test_log_message(mmp_request_handler, caplog): @@ -109,7 +109,7 @@ def test_get_checkpoint_info(mmp_configuration, mmp_request_handler): decoded_result = msgpack.unpackb(result, raw=False) assert decoded_result[0] == ResponseType.SUCCESS.value - timestamp, checkpoints, resume = decoded_result[1:] + timestamp, checkpoints, resume, snapshot_directory = decoded_result[1:] ref_time = datetime.fromtimestamp(timestamp, tz=timezone.utc) assert ref_time == mmp_request_handler._reference_time @@ -126,6 +126,21 @@ def test_get_checkpoint_info(mmp_configuration, mmp_request_handler): assert resume is not None assert Path(resume) == resume_path + assert snapshot_directory is None + + +def test_get_checkpoint_info2(registered_mmp_request_handler2, tmp_path): + request = [RequestType.GET_CHECKPOINT_INFO.value, 'test_instance'] + encoded_request = msgpack.packb(request, use_bin_type=True) + + result = registered_mmp_request_handler2.handle_request(encoded_request) + decoded_result = msgpack.unpackb(result, raw=False) + + assert decoded_result[0] == ResponseType.SUCCESS.value + snapshot_directory = decoded_result[4] + assert snapshot_directory == ( + str(tmp_path) + '/instances/test_instance/snapshots') + def test_double_register_instance(mmp_request_handler): request = [ diff --git a/libmuscle/python/libmuscle/mmp_client.py b/libmuscle/python/libmuscle/mmp_client.py index 37effdca..f40ea48b 100644 --- a/libmuscle/python/libmuscle/mmp_client.py +++ b/libmuscle/python/libmuscle/mmp_client.py @@ -22,6 +22,9 @@ PEER_INTERVAL_MIN = 5.0 PEER_INTERVAL_MAX = 10.0 +_CheckpointInfoType = Tuple[ + datetime, Checkpoints, Optional[Path], Optional[Path]] + def encode_operator(op: Operator) -> str: """Convert an Operator to a MsgPack-compatible value.""" @@ -63,8 +66,9 @@ def decode_checkpoint_rule(rule: Dict[str, Any]) -> CheckpointRule: def decode_checkpoint_info( reference_timestamp: float, checkpoints_dict: Dict[str, Any], - resume: Optional[str] - ) -> Tuple[datetime, Checkpoints, Optional[Path]]: + resume: Optional[str], + snapshot_dir: Optional[str] + ) -> _CheckpointInfoType: """Decode checkpoint info from a MsgPack-compatible value. Args: @@ -72,11 +76,13 @@ def decode_checkpoint_info( wallclock_time = 0 checkpoints_dict: dictionary of checkpoint definitions resume: optional string indicating resume path + snapshot_dir: optional string indicating path to store snapshots in Returns: wallclock_time_reference: UTC time where wallclock_time = 0 checkpoints: checkpoint configuration resume: path to the resume snapshot + snapshot_dir: optional path to store snapshots in """ ref_time = datetime.fromtimestamp(reference_timestamp, tz=timezone.utc) checkpoints = Checkpoints( @@ -86,7 +92,8 @@ def decode_checkpoint_info( simulation_time=[decode_checkpoint_rule(rule) for rule in checkpoints_dict["simulation_time"]]) resume_path = None if resume is None else Path(resume) - return (ref_time, checkpoints, resume_path) + snapshot_path = None if snapshot_dir is None else Path(snapshot_dir) + return (ref_time, checkpoints, resume_path, snapshot_path) class MMPClient(): @@ -162,14 +169,14 @@ def get_settings(self) -> Settings: response = self._call_manager(request) return Settings(response[1]) - def get_checkpoint_info(self, name: Reference - ) -> Tuple[datetime, Checkpoints, Optional[Path]]: + def get_checkpoint_info(self, name: Reference) -> _CheckpointInfoType: """Get the checkpoint info from the manager. Returns: wallclock_time_reference: UTC time where wallclock_time = 0 checkpoints: checkpoint configuration resume: path to the resume snapshot + snapshot_directory: path to store snapshots """ request = [RequestType.GET_CHECKPOINT_INFO.value, str(name)] response = self._call_manager(request) diff --git a/libmuscle/python/libmuscle/snapshot_manager.py b/libmuscle/python/libmuscle/snapshot_manager.py index 54059375..0678c96c 100644 --- a/libmuscle/python/libmuscle/snapshot_manager.py +++ b/libmuscle/python/libmuscle/snapshot_manager.py @@ -50,7 +50,6 @@ def __init__(self, self._first_reuse = True self._trigger_manager = TriggerManager() self._resume_from_snapshot = None # type: Optional[Snapshot] - self._snapshot_directory = None # type: Optional[Path] self._next_snapshot_num = 1 def get_checkpoint_info(self) -> None: @@ -62,7 +61,8 @@ def get_checkpoint_info(self) -> None: def _set_checkpoint_info(self, utc_reference: datetime, checkpoints: Checkpoints, - resume: Optional[Path]) -> None: + resume: Optional[Path], + snapshot_directory: Optional[Path]) -> None: """Apply checkpoint info received from the manager. Args: @@ -71,6 +71,7 @@ def _set_checkpoint_info(self, resume: previous snapshot to resume from (or None if not resuming) """ self._trigger_manager.set_checkpoint_info(utc_reference, checkpoints) + self._snapshot_directory = snapshot_directory or Path.cwd() if resume is not None: snapshot = self.load_snapshot_from_file(resume) if snapshot.message is not None: @@ -82,7 +83,7 @@ def _set_checkpoint_info(self, self._communicator.restore_message_counts( snapshot.port_message_counts) - def reuse_instance(self, snapshot_directory: Optional[Path], + def reuse_instance(self, do_reuse: bool, f_init_max_timestamp: Optional[float] ) -> None: """Callback on Instance.reuse_instance @@ -106,8 +107,6 @@ def reuse_instance(self, snapshot_directory: Optional[Path], self._trigger_manager.reuse_instance() - self._snapshot_directory = snapshot_directory - if self._first_reuse: self._first_reuse = False else: @@ -246,10 +245,6 @@ def __store_snapshot(self, snapshot: Snapshot) -> Path: Path where the snapshot is stored """ _logger.debug(f'Saving snapshot to {self._snapshot_directory}') - if self._snapshot_directory is None: - raise RuntimeError('Unknown snapshot directory. Did you try to' - ' save a snapshot before entering the reuse' - ' loop?') for _ in range(_MAX_FILE_EXISTS_CHECK): # Expectation is that muscle_snapshot_directory is empty initially # and we succeed in the first loop. Still wrapping in a for-loop diff --git a/libmuscle/python/libmuscle/test/test_instance.py b/libmuscle/python/libmuscle/test/test_instance.py index 54044a00..17e3e3e0 100644 --- a/libmuscle/python/libmuscle/test/test_instance.py +++ b/libmuscle/python/libmuscle/test/test_instance.py @@ -37,7 +37,7 @@ def sys_argv_instance() -> Generator[None, None, None]: @pytest.fixture -def instance(sys_argv_instance): +def instance(sys_argv_instance, tmp_path): with patch('libmuscle.instance.MMPClient') as mmp_client, \ patch('libmuscle.instance.Communicator') as comm_type: communicator = MagicMock() @@ -49,7 +49,8 @@ def instance(sys_argv_instance): mmp_client_object = MagicMock() mmp_client_object.request_peers.return_value = (None, None, None) - checkpoint_info = (datetime.now(timezone.utc), Checkpoints(), None) + checkpoint_info = (datetime.now(timezone.utc), Checkpoints(), None, + tmp_path) mmp_client_object.get_checkpoint_info.return_value = checkpoint_info mmp_client.return_value = mmp_client_object @@ -62,12 +63,13 @@ def instance(sys_argv_instance): @pytest.fixture -def instance2(sys_argv_instance): +def instance2(sys_argv_instance, tmp_path): with patch('libmuscle.instance.MMPClient') as mmp_client, \ patch('libmuscle.instance.Communicator'): mmp_client_object = MagicMock() mmp_client_object.request_peers.return_value = (None, None, None) - checkpoint_info = (datetime.now(timezone.utc), Checkpoints(), None) + checkpoint_info = (datetime.now(timezone.utc), Checkpoints(), None, + tmp_path) mmp_client_object.get_checkpoint_info.return_value = checkpoint_info mmp_client.return_value = mmp_client_object instance = Instance({ @@ -77,12 +79,13 @@ def instance2(sys_argv_instance): def test_create_instance( - sys_argv_instance, log_file_in_tmpdir, sys_argv_manager): + sys_argv_instance, log_file_in_tmpdir, sys_argv_manager, tmp_path): with patch('libmuscle.instance.MMPClient') as mmp_client, \ patch('libmuscle.instance.Communicator') as comm_type: mmp_client_object = MagicMock() mmp_client_object.request_peers.return_value = (None, None, None) - checkpoint_info = (datetime.now(timezone.utc), Checkpoints(), None) + checkpoint_info = (datetime.now(timezone.utc), Checkpoints(), None, + tmp_path) mmp_client_object.get_checkpoint_info.return_value = checkpoint_info mmp_client.return_value = mmp_client_object ports = { diff --git a/libmuscle/python/libmuscle/test/test_snapshot_manager.py b/libmuscle/python/libmuscle/test/test_snapshot_manager.py index 16f81ce3..6325cb0d 100644 --- a/libmuscle/python/libmuscle/test/test_snapshot_manager.py +++ b/libmuscle/python/libmuscle/test/test_snapshot_manager.py @@ -22,10 +22,10 @@ def test_no_checkpointing(caplog: pytest.LogCaptureFixture, tmp_path: Path ImplementationState.STATEFUL) snapshot_manager._set_checkpoint_info( - datetime.now(timezone.utc), Checkpoints(), None) + datetime.now(timezone.utc), Checkpoints(), None, tmp_path) assert not snapshot_manager.resuming() - snapshot_manager.reuse_instance(tmp_path, True, None) + snapshot_manager.reuse_instance(True, None) assert not snapshot_manager.resuming() assert not snapshot_manager.should_save_snapshot(1) assert not snapshot_manager.should_save_snapshot(5000) @@ -49,10 +49,10 @@ def test_save_load_snapshot(tmp_path: Path) -> None: checkpoints = Checkpoints(simulation_time=[CheckpointRangeRule(every=1)]) snapshot_manager._set_checkpoint_info( - datetime.now(timezone.utc), checkpoints, None) + datetime.now(timezone.utc), checkpoints, None, tmp_path) assert not snapshot_manager.resuming() - snapshot_manager.reuse_instance(tmp_path, True, None) + snapshot_manager.reuse_instance(True, None) with pytest.raises(RuntimeError): snapshot_manager.load_snapshot() @@ -79,11 +79,11 @@ def test_save_load_snapshot(tmp_path: Path) -> None: instance_id, manager, communicator, ImplementationState.STATEFUL) snapshot_manager2._set_checkpoint_info( - datetime.now(timezone.utc), checkpoints, snapshot_path) + datetime.now(timezone.utc), checkpoints, snapshot_path, tmp_path) communicator.restore_message_counts.assert_called_with(port_message_counts) assert snapshot_manager2.resuming() - snapshot_manager2.reuse_instance(tmp_path, True, None) + snapshot_manager2.reuse_instance(True, None) assert snapshot_manager2.resuming() msg = snapshot_manager2.load_snapshot() assert msg.timestamp == 0.2 @@ -109,7 +109,7 @@ def test_save_load_snapshot(tmp_path: Path) -> None: assert snapshot_path.name == 'test-1_2.pack' assert snapshot_manager2.resuming() - snapshot_manager2.reuse_instance(tmp_path, True, None) + snapshot_manager2.reuse_instance(True, None) assert not snapshot_manager2.resuming() @@ -125,11 +125,11 @@ def test_save_load_implicit_snapshot(tmp_path: Path) -> None: checkpoints = Checkpoints(simulation_time=[CheckpointRangeRule(every=1)]) snapshot_manager._set_checkpoint_info( - datetime.now(timezone.utc), checkpoints, None) + datetime.now(timezone.utc), checkpoints, None, tmp_path) assert not snapshot_manager.resuming() - snapshot_manager.reuse_instance(tmp_path, True, None) - snapshot_manager.reuse_instance(tmp_path, True, 1.5) + snapshot_manager.reuse_instance(True, None) + snapshot_manager.reuse_instance(True, 1.5) manager.submit_snapshot_metadata.assert_called_once() instance, metadata = manager.submit_snapshot_metadata.call_args[0] assert instance == instance_id @@ -141,11 +141,11 @@ def test_save_load_implicit_snapshot(tmp_path: Path) -> None: instance_id, manager, communicator, ImplementationState.STATELESS) snapshot_manager2._set_checkpoint_info( - datetime.now(timezone.utc), checkpoints, snapshot_path) + datetime.now(timezone.utc), checkpoints, snapshot_path, tmp_path) communicator.restore_message_counts.assert_called_with(port_message_counts) assert not snapshot_manager2.resuming() - snapshot_manager2.reuse_instance(tmp_path, True, 1.5) + snapshot_manager2.reuse_instance(True, 1.5) assert not snapshot_manager2.resuming() - snapshot_manager2.reuse_instance(tmp_path, True, 2.5) + snapshot_manager2.reuse_instance(True, 2.5) manager.submit_snapshot_metadata.assert_called_once() From fa648e8b68c05c94e71f92c75c1e73e75408adad Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Mon, 28 Nov 2022 11:36:05 +0100 Subject: [PATCH 098/183] Bugfixes, new test, create snapshot at resume - Add test_snapshot_dispatch which is a pure serial workflow - Each actor will also store the snapshot it is resumed from --- integration_test/conftest.py | 9 +- integration_test/test_snapshot_dispatch.py | 124 ++++++++++++++++++ integration_test/test_snapshot_macro_micro.py | 69 ++++------ .../python/libmuscle/checkpoint_triggers.py | 4 +- libmuscle/python/libmuscle/instance.py | 5 +- libmuscle/python/libmuscle/mmp_client.py | 2 +- .../python/libmuscle/snapshot_manager.py | 6 +- .../libmuscle/test/test_snapshot_manager.py | 4 +- 8 files changed, 172 insertions(+), 51 deletions(-) create mode 100644 integration_test/test_snapshot_dispatch.py diff --git a/integration_test/conftest.py b/integration_test/conftest.py index ad59842a..a6c70b1e 100644 --- a/integration_test/conftest.py +++ b/integration_test/conftest.py @@ -26,6 +26,12 @@ def yatiml_log_warning(): yatiml.logger.setLevel(logging.WARNING) +def ls_snapshots(run_dir, instance=None): + """List all snapshots of the instance or workflow""" + return sorted(run_dir.snapshot_dir(instance).iterdir(), + key=lambda path: tuple(map(int, path.stem.split("_")[1:]))) + + def start_mmp_server(control_pipe, ymmsl_doc, run_dir): control_pipe[0].close() manager = Manager(ymmsl_doc, run_dir) @@ -108,7 +114,8 @@ def run_manager_with_actors( for instance_name, callable in python_actors.items(): proc = mp.Process( target=_python_wrapper, - args=(instance_name, env['MUSCLE_MANAGER'], callable)) + args=(instance_name, env['MUSCLE_MANAGER'], callable), + name=instance_name) proc.start() python_processes.append(proc) diff --git a/integration_test/test_snapshot_dispatch.py b/integration_test/test_snapshot_dispatch.py new file mode 100644 index 00000000..2b9d3b57 --- /dev/null +++ b/integration_test/test_snapshot_dispatch.py @@ -0,0 +1,124 @@ +import pytest +from ymmsl import ImplementationState, Operator, load, dump + +from libmuscle import Instance, Message +from libmuscle.manager.run_dir import RunDir + +from .conftest import run_manager_with_actors, ls_snapshots + + +_LOG_LEVEL = 'INFO' # set to DEBUG for additional debug info + + +def component(): + instance = Instance({ + Operator.F_INIT: ['f_i'], + Operator.O_F: ['o_f']}) + + while instance.reuse_instance(): + dt = instance.get_setting('dt', 'float') + t_max = instance.get_setting('t_max', 'float') + + if instance.resuming(): + msg = instance.load_snapshot() + t_cur = msg.timestamp + i, t_stop = msg.data + + if instance.should_init(): + msg = instance.receive('f_i', default=Message(0, None, 0)) + t_cur = msg.timestamp + i = msg.data + t_stop = t_cur + t_max + + while t_cur < t_stop: + # faux time-integration for testing snapshots + t_cur += dt + + if instance.should_save_snapshot(t_cur): + instance.save_snapshot(Message(t_cur, None, [i, t_stop])) + + instance.send('o_f', Message(t_cur, None, i)) + + if instance.should_save_final_snapshot(): + instance.save_final_snapshot(Message(t_cur, None, [i, t_stop])) + + +def stateless_component(): + instance = Instance({ + Operator.F_INIT: ['f_i'], + Operator.O_F: ['o_f']}, + stateful=ImplementationState.STATELESS) + + while instance.reuse_instance(): + dt = instance.get_setting('dt', 'float') + t_max = instance.get_setting('t_max', 'float') + + msg = instance.receive('f_i', default=Message(0, None, 0)) + t_cur = msg.timestamp + i = msg.data + t_stop = t_cur + t_max + + while t_cur < t_stop: + # faux time-integration for testing snapshots + t_cur += dt + + instance.send('o_f', Message(t_cur, None, i)) + + +@pytest.fixture +def dispatch_config(): + return load(f"""ymmsl_version: v0.1 +model: + name: test_snapshot + components: + comp1: component + comp2: component + comp3: component + comp4: component + comp5: component + conduits: + comp1.o_f: comp2.f_i + comp2.o_f: comp3.f_i + comp3.o_f: comp4.f_i + comp4.o_f: comp5.f_i +settings: + dt: 0.1234 + t_max: 2.0 + muscle_remote_log_level: {_LOG_LEVEL} +checkpoints: + at_end: true + simulation_time: + - every: 2.5 + - at: + - 2.3 + - 2.8""") + + +def test_snapshot_dispatch(tmp_path, dispatch_config): + actors = {f'comp{i + 1}': component for i in range(5)} + run_dir1 = RunDir(tmp_path / 'run1') + run_manager_with_actors( + dump(dispatch_config), run_dir1.path, python_actors=actors) + + assert len(ls_snapshots(run_dir1, 'comp1')) == 2 # t=0, at_end + assert len(ls_snapshots(run_dir1, 'comp2')) == 5 # t=0, 2.5, 2.3, 2.8, at_end + assert len(ls_snapshots(run_dir1, 'comp3')) == 3 # t=2.5, 5, at_end + assert len(ls_snapshots(run_dir1, 'comp4')) == 3 # t=5, 7.5, at_end + assert len(ls_snapshots(run_dir1, 'comp5')) == 3 # t=7.5, 10, at_end + + snapshots_ymmsl = ls_snapshots(run_dir1) + snapshot_docs = list(map(load, snapshots_ymmsl)) + assert len(snapshot_docs) == 16 + + # resume from the snapshots taken at t>=2.3 + run_dir2 = RunDir(tmp_path / 'run2') + dispatch_config.update(snapshot_docs[3]) # add resume info + run_manager_with_actors( + dump(dispatch_config), run_dir2.path, python_actors=actors) + + assert len(ls_snapshots(run_dir2, 'comp1')) == 1 # resume + assert len(ls_snapshots(run_dir2, 'comp2')) == 4 # resume, t=2.5, 2.8, at_end + assert len(ls_snapshots(run_dir2, 'comp3')) == 3 # t=2.5, 5, at_end + assert len(ls_snapshots(run_dir2, 'comp4')) == 3 # t=5, 7.5, at_end + assert len(ls_snapshots(run_dir2, 'comp5')) == 3 # t=7.5, 10, at_end + assert len(ls_snapshots(run_dir2)) == 13 diff --git a/integration_test/test_snapshot_macro_micro.py b/integration_test/test_snapshot_macro_micro.py index 0420df98..f8b11cb4 100644 --- a/integration_test/test_snapshot_macro_micro.py +++ b/integration_test/test_snapshot_macro_micro.py @@ -1,11 +1,11 @@ -from .conftest import run_manager_with_actors - import pytest from ymmsl import ImplementationState, Operator, load, dump from libmuscle import Instance, Message from libmuscle.manager.run_dir import RunDir +from .conftest import run_manager_with_actors, ls_snapshots + _LOG_LEVEL = 'INFO' # set to DEBUG for additional debug info @@ -202,13 +202,11 @@ def test_snapshot_macro_micro(tmp_path, base_config): run_manager_with_actors( dump(base_config), run_dir1.path, python_actors=actors) - # Note: sorted only works because we have fewer than 10 snapshots, otherwise - # _10 would be sorted right after _1 - macro_snapshots = sorted(run_dir1.snapshot_dir('macro').iterdir()) + macro_snapshots = ls_snapshots(run_dir1, 'macro') assert len(macro_snapshots) == 6 # 0, 0.4, 0.8, 1.2, 1.6, final - micro_snapshots = sorted(run_dir1.snapshot_dir('micro').iterdir()) + micro_snapshots = ls_snapshots(run_dir1, 'micro') assert len(micro_snapshots) == 6 # 0, 0.4, 0.8, 1.2, 1.6, final - snapshots_ymmsl = sorted(run_dir1.snapshot_dir().iterdir()) + snapshots_ymmsl = ls_snapshots(run_dir1) snapshot_docs = list(map(load, snapshots_ymmsl)) assert len(snapshot_docs) == 7 assert snapshot_docs[0].resume['macro'] == macro_snapshots[0] @@ -225,12 +223,9 @@ def test_snapshot_macro_micro(tmp_path, base_config): run_manager_with_actors( dump(base_config), run_dir2.path, python_actors=actors) - macro_snapshots = sorted(run_dir2.snapshot_dir('macro').iterdir()) - assert len(macro_snapshots) == 2 # 1.6, final - micro_snapshots = sorted(run_dir2.snapshot_dir('micro').iterdir()) - assert len(micro_snapshots) == 2 # 1.6, final - snapshots_ymmsl = sorted(run_dir2.snapshot_dir().iterdir()) - assert len(snapshots_ymmsl) == 2 + assert len(ls_snapshots(run_dir2, 'macro')) == 3 # resume, 1.6, final + assert len(ls_snapshots(run_dir2, 'micro')) == 3 # resume, 1.6, final + assert len(ls_snapshots(run_dir2)) == 3 # resume from the first workflow snapshot (this restarts macro from scratch) run_dir3 = RunDir(tmp_path / 'run3') @@ -247,13 +242,9 @@ def test_snapshot_macro_stateless_micro(tmp_path, base_config): run_manager_with_actors( dump(base_config), run_dir1.path, python_actors=actors) - # Note: sorted only works because we have fewer than 10 snapshots, otherwise - # _10 would be sorted right after _1 - macro_snapshots = sorted(run_dir1.snapshot_dir('macro').iterdir()) - assert len(macro_snapshots) == 6 # 0, 0.4, 0.8, 1.2, 1.6, final - micro_snapshots = sorted(run_dir1.snapshot_dir('micro').iterdir()) - assert len(micro_snapshots) == 6 # 0, 0.4, 0.8, 1.2, 1.6, final - snapshots_ymmsl = sorted(run_dir1.snapshot_dir().iterdir()) + assert len(ls_snapshots(run_dir1, 'macro')) == 6 # 0, 0.4, 0.8, 1.2, 1.6, final + assert len(ls_snapshots(run_dir1, 'micro')) == 6 # 0, 0.4, 0.8, 1.2, 1.6, final + snapshots_ymmsl = ls_snapshots(run_dir1) snapshot_docs = list(map(load, snapshots_ymmsl)) assert len(snapshot_docs) == 6 @@ -263,12 +254,9 @@ def test_snapshot_macro_stateless_micro(tmp_path, base_config): run_manager_with_actors( dump(base_config), run_dir2.path, python_actors=actors) - macro_snapshots = sorted(run_dir2.snapshot_dir('macro').iterdir()) - assert len(macro_snapshots) == 2 # 1.6, final - micro_snapshots = sorted(run_dir2.snapshot_dir('micro').iterdir()) - assert len(micro_snapshots) == 3 # 1.2, 1.6, final - snapshots_ymmsl = sorted(run_dir2.snapshot_dir().iterdir()) - assert len(snapshots_ymmsl) == 2 + assert len(ls_snapshots(run_dir2, 'macro')) == 3 # resume, 1.6, final + assert len(ls_snapshots(run_dir2, 'micro')) == 4 # resume, 1.2, 1.6, final + assert len(ls_snapshots(run_dir2)) == 3 def test_snapshot_macro_vector_micro(tmp_path, base_config): @@ -279,13 +267,10 @@ def test_snapshot_macro_vector_micro(tmp_path, base_config): run_manager_with_actors( dump(base_config), run_dir1.path, python_actors=actors) - macro_snapshots = sorted(run_dir1.snapshot_dir('macro').iterdir()) - assert len(macro_snapshots) == 6 # 0, 0.4, 0.8, 1.2, 1.6, final - micro_snapshots = sorted(run_dir1.snapshot_dir('micro[0]').iterdir()) - assert len(micro_snapshots) == 6 # 0, 0.4, 0.8, 1.2, 1.6, final - micro_snapshots = sorted(run_dir1.snapshot_dir('micro[1]').iterdir()) - assert len(micro_snapshots) == 6 # 0, 0.4, 0.8, 1.2, 1.6, final - snapshots_ymmsl = sorted(run_dir1.snapshot_dir().iterdir()) + assert len(ls_snapshots(run_dir1, 'macro')) == 6 # 0, 0.4, 0.8, 1.2, 1.6, final + assert len(ls_snapshots(run_dir1, 'micro[0]')) == 6 # 0, 0.4, 0.8, 1.2, 1.6, final + assert len(ls_snapshots(run_dir1, 'micro[1]')) == 6 # 0, 0.4, 0.8, 1.2, 1.6, final + snapshots_ymmsl = ls_snapshots(run_dir1) assert len(snapshots_ymmsl) == 8 run_dir2 = RunDir(tmp_path / 'run2') @@ -293,14 +278,10 @@ def test_snapshot_macro_vector_micro(tmp_path, base_config): run_manager_with_actors( dump(base_config), run_dir2.path, python_actors=actors) - macro_snapshots = sorted(run_dir2.snapshot_dir('macro').iterdir()) - assert len(macro_snapshots) == 2 # 1.6, final - micro_snapshots = sorted(run_dir2.snapshot_dir('micro[0]').iterdir()) - assert len(micro_snapshots) == 2 # 1.6, final - micro_snapshots = sorted(run_dir2.snapshot_dir('micro[1]').iterdir()) - assert len(micro_snapshots) == 2 # 1.6, final - snapshots_ymmsl = sorted(run_dir2.snapshot_dir().iterdir()) - assert len(snapshots_ymmsl) == 2 + assert len(ls_snapshots(run_dir2, 'macro')) == 3 # resume, 1.6, final + assert len(ls_snapshots(run_dir2, 'micro[0]')) == 3 # resume, 1.6, final + assert len(ls_snapshots(run_dir2, 'micro[1]')) == 3 # resume, 1.6, final + assert len(ls_snapshots(run_dir2)) == 3 def test_snapshot_macro_transformer_micro(tmp_path, config_with_transformer): @@ -311,7 +292,7 @@ def test_snapshot_macro_transformer_micro(tmp_path, config_with_transformer): run_manager_with_actors( dump(config_with_transformer), run_dir1.path, python_actors=actors) - snapshots_ymmsl = sorted(run_dir1.snapshot_dir().iterdir()) + snapshots_ymmsl = ls_snapshots(run_dir1) assert len(snapshots_ymmsl) == 8 # pick one to resume from @@ -320,5 +301,5 @@ def test_snapshot_macro_transformer_micro(tmp_path, config_with_transformer): run_manager_with_actors( dump(config_with_transformer), run_dir2.path, python_actors=actors) - snapshots_ymmsl = sorted(run_dir2.snapshot_dir().iterdir()) - assert len(snapshots_ymmsl) == 3 + snapshots_ymmsl = ls_snapshots(run_dir2) + assert len(snapshots_ymmsl) == 6 diff --git a/libmuscle/python/libmuscle/checkpoint_triggers.py b/libmuscle/python/libmuscle/checkpoint_triggers.py index a33a785d..a4edf3be 100644 --- a/libmuscle/python/libmuscle/checkpoint_triggers.py +++ b/libmuscle/python/libmuscle/checkpoint_triggers.py @@ -297,8 +297,8 @@ def reuse_instance(self) -> None: _checkpoint_error('You must call "should_save_final" exactly' ' once in the reuse loop of an instance that' ' supports checkpointing.') - self._should_save_final_called = False - self._saved_final_checkpoint = False + self._should_save_final_called = False + self._saved_final_checkpoint = False def update_checkpoints(self, timestamp: float, final: bool) -> None: """Update last and next checkpoint times when a snapshot is made. diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py index 48bb2905..a0984fa2 100644 --- a/libmuscle/python/libmuscle/instance.py +++ b/libmuscle/python/libmuscle/instance.py @@ -670,6 +670,8 @@ def __check_reuse_instance(self, apply_overlay: bool) -> bool: # receive something that was sent on the last go-around. # At least emit a warning. if self.should_init() or not self._first_run: + # self.should_init() might be False in first should_save_final(), + # but self._first_run is already updated by then self.__pre_receive_f_init(apply_overlay) self._set_local_log_level() @@ -682,7 +684,8 @@ def __check_reuse_instance(self, apply_overlay: bool) -> bool: no_settings_in = not self._communicator.settings_in_connected() if f_init_not_connected and no_settings_in: - do_reuse = self._first_run + do_reuse = self._first_run and (not self.resuming() or + not self.should_init()) else: for message in self._f_init_cache.values(): if isinstance(message.data, ClosePort): diff --git a/libmuscle/python/libmuscle/mmp_client.py b/libmuscle/python/libmuscle/mmp_client.py index f40ea48b..188814ff 100644 --- a/libmuscle/python/libmuscle/mmp_client.py +++ b/libmuscle/python/libmuscle/mmp_client.py @@ -56,7 +56,7 @@ def encode_profile_event(event: ProfileEvent) -> Any: def decode_checkpoint_rule(rule: Dict[str, Any]) -> CheckpointRule: """Decode a checkpoint rule from a MsgPack-compatible value.""" - if rule.keys() == {'in'}: + if rule.keys() == {'at'}: return CheckpointAtRule(**rule) if rule.keys() == {'start', 'stop', 'every'}: return CheckpointRangeRule(**rule) diff --git a/libmuscle/python/libmuscle/snapshot_manager.py b/libmuscle/python/libmuscle/snapshot_manager.py index 0678c96c..0bd3de83 100644 --- a/libmuscle/python/libmuscle/snapshot_manager.py +++ b/libmuscle/python/libmuscle/snapshot_manager.py @@ -82,6 +82,10 @@ def _set_checkpoint_info(self, snapshot.is_final_snapshot) self._communicator.restore_message_counts( snapshot.port_message_counts) + # Store a copy of the snapshot in the current run directory + path = self.__store_snapshot(snapshot) + metadata = SnapshotMetadata.from_snapshot(snapshot, str(path)) + self._manager.submit_snapshot_metadata(self._instance_id, metadata) def reuse_instance(self, do_reuse: bool, f_init_max_timestamp: Optional[float] @@ -202,7 +206,7 @@ def __save_snapshot( metadata = SnapshotMetadata.from_snapshot(snapshot, str(path)) self._manager.submit_snapshot_metadata(self._instance_id, metadata) - timestamp = msg.timestamp if msg is not None else -1.0 + timestamp = msg.timestamp if msg is not None else float('-inf') if final and f_init_max_timestamp is not None: # For final snapshots f_init_max_snapshot is the reference time (see # should_save_final_snapshot). diff --git a/libmuscle/python/libmuscle/test/test_snapshot_manager.py b/libmuscle/python/libmuscle/test/test_snapshot_manager.py index 6325cb0d..ffec4744 100644 --- a/libmuscle/python/libmuscle/test/test_snapshot_manager.py +++ b/libmuscle/python/libmuscle/test/test_snapshot_manager.py @@ -106,7 +106,7 @@ def test_save_load_snapshot(tmp_path: Path) -> None: assert metadata.is_final_snapshot snapshot_path = Path(metadata.snapshot_filename) assert snapshot_path.parent == tmp_path - assert snapshot_path.name == 'test-1_2.pack' + assert snapshot_path.name == 'test-1_3.pack' assert snapshot_manager2.resuming() snapshot_manager2.reuse_instance(True, None) @@ -143,6 +143,8 @@ def test_save_load_implicit_snapshot(tmp_path: Path) -> None: snapshot_manager2._set_checkpoint_info( datetime.now(timezone.utc), checkpoints, snapshot_path, tmp_path) communicator.restore_message_counts.assert_called_with(port_message_counts) + manager.submit_snapshot_metadata.assert_called_once() + manager.submit_snapshot_metadata.reset_mock() assert not snapshot_manager2.resuming() snapshot_manager2.reuse_instance(True, 1.5) From b2dcfae2b131a6dca4ef404fe426214c9798ca75 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Mon, 28 Nov 2022 14:02:01 +0100 Subject: [PATCH 099/183] Fix non-deterministic CI failures --- integration_test/test_snapshot_dispatch.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/integration_test/test_snapshot_dispatch.py b/integration_test/test_snapshot_dispatch.py index 2b9d3b57..3c2c791b 100644 --- a/integration_test/test_snapshot_dispatch.py +++ b/integration_test/test_snapshot_dispatch.py @@ -108,7 +108,9 @@ def test_snapshot_dispatch(tmp_path, dispatch_config): snapshots_ymmsl = ls_snapshots(run_dir1) snapshot_docs = list(map(load, snapshots_ymmsl)) - assert len(snapshot_docs) == 16 + # More ymmsl restarts files may be possible, depending on the sequence of + # incoming SnapshotMetadata... + assert len(snapshot_docs) >= 16 # resume from the snapshots taken at t>=2.3 run_dir2 = RunDir(tmp_path / 'run2') @@ -121,4 +123,6 @@ def test_snapshot_dispatch(tmp_path, dispatch_config): assert len(ls_snapshots(run_dir2, 'comp3')) == 3 # t=2.5, 5, at_end assert len(ls_snapshots(run_dir2, 'comp4')) == 3 # t=5, 7.5, at_end assert len(ls_snapshots(run_dir2, 'comp5')) == 3 # t=7.5, 10, at_end - assert len(ls_snapshots(run_dir2)) == 13 + # More ymmsl restarts files may be possible, depending on the sequence of + # incoming SnapshotMetadata... + assert len(ls_snapshots(run_dir2)) >= 13 From 978dd6d125ff5d5a0136caef6524f59bb0f1a98c Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Mon, 28 Nov 2022 14:26:59 +0100 Subject: [PATCH 100/183] More checks to understand CI failures --- integration_test/test_snapshot_dispatch.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/integration_test/test_snapshot_dispatch.py b/integration_test/test_snapshot_dispatch.py index 3c2c791b..495bcd55 100644 --- a/integration_test/test_snapshot_dispatch.py +++ b/integration_test/test_snapshot_dispatch.py @@ -96,6 +96,7 @@ def dispatch_config(): def test_snapshot_dispatch(tmp_path, dispatch_config): actors = {f'comp{i + 1}': component for i in range(5)} + (tmp_path / 'run1').mkdir() run_dir1 = RunDir(tmp_path / 'run1') run_manager_with_actors( dump(dispatch_config), run_dir1.path, python_actors=actors) @@ -108,13 +109,20 @@ def test_snapshot_dispatch(tmp_path, dispatch_config): snapshots_ymmsl = ls_snapshots(run_dir1) snapshot_docs = list(map(load, snapshots_ymmsl)) - # More ymmsl restarts files may be possible, depending on the sequence of - # incoming SnapshotMetadata... - assert len(snapshot_docs) >= 16 + assert len(snapshot_docs) == 16 # resume from the snapshots taken at t>=2.3 + (tmp_path / 'run2').mkdir() run_dir2 = RunDir(tmp_path / 'run2') dispatch_config.update(snapshot_docs[3]) # add resume info + # validate resume info + resume = snapshot_docs[3].resume + assert resume['comp1'] == ls_snapshots(run_dir1, 'comp1')[1] + assert resume['comp2'] == ls_snapshots(run_dir1, 'comp2')[1] + assert 'comp3' not in resume + assert 'comp4' not in resume + assert 'comp5' not in resume + run_manager_with_actors( dump(dispatch_config), run_dir2.path, python_actors=actors) From 023fabef5d78aca06420dc9c8a5cfc55430ca8f2 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Mon, 28 Nov 2022 14:32:53 +0100 Subject: [PATCH 101/183] Deterministic restart for dispatch test case --- integration_test/test_snapshot_dispatch.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/integration_test/test_snapshot_dispatch.py b/integration_test/test_snapshot_dispatch.py index 495bcd55..106f6d3c 100644 --- a/integration_test/test_snapshot_dispatch.py +++ b/integration_test/test_snapshot_dispatch.py @@ -96,7 +96,6 @@ def dispatch_config(): def test_snapshot_dispatch(tmp_path, dispatch_config): actors = {f'comp{i + 1}': component for i in range(5)} - (tmp_path / 'run1').mkdir() run_dir1 = RunDir(tmp_path / 'run1') run_manager_with_actors( dump(dispatch_config), run_dir1.path, python_actors=actors) @@ -109,19 +108,15 @@ def test_snapshot_dispatch(tmp_path, dispatch_config): snapshots_ymmsl = ls_snapshots(run_dir1) snapshot_docs = list(map(load, snapshots_ymmsl)) + # More ymmsl restarts files may be possible, depending on the sequence of + # incoming SnapshotMetadata... assert len(snapshot_docs) == 16 # resume from the snapshots taken at t>=2.3 - (tmp_path / 'run2').mkdir() run_dir2 = RunDir(tmp_path / 'run2') - dispatch_config.update(snapshot_docs[3]) # add resume info - # validate resume info - resume = snapshot_docs[3].resume - assert resume['comp1'] == ls_snapshots(run_dir1, 'comp1')[1] - assert resume['comp2'] == ls_snapshots(run_dir1, 'comp2')[1] - assert 'comp3' not in resume - assert 'comp4' not in resume - assert 'comp5' not in resume + dispatch_config.resume = { + 'comp1': ls_snapshots(run_dir1, 'comp1')[1], + 'comp2': ls_snapshots(run_dir1, 'comp2')[1]} run_manager_with_actors( dump(dispatch_config), run_dir2.path, python_actors=actors) From 4675ba052d17ed0bd38b7073ef51d49e56e6d17f Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Mon, 28 Nov 2022 16:39:03 +0100 Subject: [PATCH 102/183] Snapshot tests for interact & scale bridge --- .../examples/python/interact_coupling.py | 85 ++++++++- integration_test/test_snapshot_interact.py | 164 ++++++++++++++++++ 2 files changed, 242 insertions(+), 7 deletions(-) create mode 100644 integration_test/test_snapshot_interact.py diff --git a/docs/source/examples/python/interact_coupling.py b/docs/source/examples/python/interact_coupling.py index ff9408c7..3df5e11e 100644 --- a/docs/source/examples/python/interact_coupling.py +++ b/docs/source/examples/python/interact_coupling.py @@ -1,5 +1,5 @@ import logging -from typing import Any, Optional, Tuple +from typing import Any, Optional, Tuple, Dict from libmuscle import Instance, Message from libmuscle.runner import run_simulation @@ -129,7 +129,8 @@ class Peer: via the instance object. """ def __init__( - self, instance: Instance, in_port: str, out_port: str) -> None: + self, instance: Instance, in_port: str, out_port: str, + resume_from_state: Any = None) -> None: """Create a Peer object. This also receives an initial message from the peer model, and @@ -145,11 +146,20 @@ def __init__( self.out_port = out_port self.cache = DataCache() - msg = self.instance.receive(self.in_port) - self.cache.add_data(msg.timestamp, msg.data) - self.rcvd = msg.timestamp - self.to_send = msg.timestamp - self.next = msg.next_timestamp + if resume_from_state: + self.cache.t_cur = resume_from_state['cache.t_cur'] + self.cache.data_cur = resume_from_state['cache.data_cur'] + self.cache.t_next = resume_from_state['cache.t_next'] + self.cache.data_next = resume_from_state['cache.data_next'] + self.rcvd = resume_from_state['rcvd'] + self.to_send = resume_from_state['to_send'] + self.next = resume_from_state['next'] + else: + msg = self.instance.receive(self.in_port) + self.cache.add_data(msg.timestamp, msg.data) + self.rcvd = msg.timestamp + self.to_send = msg.timestamp + self.next = msg.next_timestamp def done(self) -> bool: """Return whether we are done commmunicating with this peer.""" @@ -200,6 +210,17 @@ def send(self, t: float, data: Any) -> None: self.instance.send(self.out_port, Message(t, self.next, data)) self.to_send = self.next + def get_state(self) -> Dict[str, Any]: + """Return the current state of this object as a MUSCLE-serializable dict + """ + return {'cache.t_cur': self.cache.t_cur, + 'cache.data_cur': self.cache.data_cur, + 'cache.t_next': self.cache.t_next, + 'cache.data_next': self.cache.data_next, + 'rcvd': self.rcvd, + 'to_send': self.to_send, + 'next': self.next} + def temporal_coupler() -> None: """Model component connecting two scale-overlapping submodels. @@ -241,6 +262,56 @@ def temporal_coupler() -> None: b.send(t, data) +def checkpointing_temporal_coupler() -> None: + """Model component connecting two scale-overlapping submodels. + + This component sits in between two scale-overlapping submodels + running at different (and potentially variable) timesteps and + ensures that each of these peers receives a message whenever it + expects one, and can send a message whenever it expects to do so. + + This function extends :func:`temporal_coupler` with checkpointing + capabilities. + """ + instance = Instance({ + Operator.O_I: ['a_out', 'b_out'], + Operator.S: ['a_in', 'b_in']}) + + while instance.reuse_instance(): + if instance.resuming(): + state = instance.load_snapshot().data + if state is not None: + a = Peer(instance, 'a_in', 'a_out', state['a']) + b = Peer(instance, 'b_in', 'b_out', state['b']) + + if instance.should_init(): + # Receive initial messages and initialise state + a = Peer(instance, 'a_in', 'a_out') + b = Peer(instance, 'b_in', 'b_out') + + # Send and receive as needed + while not a.done() or not b.done(): + if a.can_receive(): + a.receive() + elif b.can_receive(): + b.receive() + elif a.can_send(b.rcvd, b.next): + t, data = b.cache.get_data(a.to_send) + a.send(t, data) + elif b.can_send(a.rcvd, a.next): + t, data = a.cache.get_data(b.to_send) + b.send(t, data) + + t_cur = min(a.rcvd, b.rcvd) + if instance.should_save_snapshot(t_cur): + instance.save_snapshot(Message( + t_cur, None, {'a': a.get_state(), 'b': b.get_state()})) + + t_cur = min(a.rcvd, b.rcvd) + if instance.should_save_final_snapshot(): + instance.save_final_snapshot(Message(t_cur, None, None)) + + if __name__ == '__main__': logging.basicConfig() logging.getLogger().setLevel(logging.INFO) diff --git a/integration_test/test_snapshot_interact.py b/integration_test/test_snapshot_interact.py new file mode 100644 index 00000000..5492f9e2 --- /dev/null +++ b/integration_test/test_snapshot_interact.py @@ -0,0 +1,164 @@ +import logging +import sys +from pathlib import Path + +import pytest +from ymmsl import Operator, load, dump + +from libmuscle import Instance, Message +from libmuscle.manager.run_dir import RunDir + +from .conftest import run_manager_with_actors, ls_snapshots + +# Make interact_coupling.py available (from docs/sources/examples) +sys.path.append(str( + Path(__file__).parents[1] / 'docs' / 'source' / 'examples' / 'python')) +import interact_coupling # noqa + +_LOG_LEVEL = 'INFO' # set to DEBUG for additional debug info + + +def component(): + instance = Instance({ + Operator.O_I: ['o_i'], + Operator.S: ['s']}) + + while instance.reuse_instance(): + t0 = instance.get_setting('t0', 'float') + dt = instance.get_setting('dt', 'float') + t_max = instance.get_setting('t_max', 'float') + + if instance.resuming(): + msg = instance.load_snapshot() + t_cur = msg.timestamp + i, t_stop = msg.data + + if instance.should_init(): + t_cur = t0 + i = 0 + t_stop = t0 + t_max + + rcvd_i = 0 + while t_cur < t_stop: + # faux time-integration for testing snapshots + t_next = t_cur + dt + if t_next >= t_stop: + t_next = None + logging.info(f'Sending {i} at {t_cur}, next at {t_next}') + instance.send('o_i', Message(t_cur, t_next, i)) + + msg = instance.receive('s') + logging.info( + f'Received {msg.data} from time {msg.timestamp},' + f' next at {msg.next_timestamp}') + assert msg.data >= rcvd_i + rcvd_i = msg.data + + t_cur += dt + i += 1 + + if instance.should_save_snapshot(t_cur): + instance.save_snapshot(Message(t_cur, None, [i, t_stop])) + + if instance.should_save_final_snapshot(): + instance.save_final_snapshot(Message(t_cur, None, [i, t_stop])) + + +def test_snapshot_interact_lockstep(tmp_path): + config = f"""ymmsl_version: v0.1 +model: + name: test_snapshot + components: + comp1: component + comp2: component + conduits: + comp1.o_i: comp2.s + comp2.o_i: comp1.s +settings: + t0: 0.35 + dt: 0.1234 + t_max: 3.0 + muscle_remote_log_level: {_LOG_LEVEL} +checkpoints: + simulation_time: + - every: 1.0 + start: 0.75 + stop: 2.0 + - at: + - 2.5""" + actors = {f'comp{i + 1}': component for i in range(2)} + + run_dir1 = RunDir(tmp_path / 'run1') + run_manager_with_actors(config, run_dir1.path, python_actors=actors) + + assert len(ls_snapshots(run_dir1, 'comp1')) == 3 # t=0.75, 1.75, 2.5 + assert len(ls_snapshots(run_dir1, 'comp2')) == 3 # t=0.75, 1.75, 2.5 + + snapshots_ymmsl = ls_snapshots(run_dir1) + snapshot_docs = list(map(load, snapshots_ymmsl)) + assert len(snapshot_docs) == 3 + + # resume from the snapshots taken at t>=1.75 + run_dir2 = RunDir(tmp_path / 'run2') + config_doc = load(config) + config_doc.update(snapshot_docs[1]) + + run_manager_with_actors( + dump(config_doc), run_dir2.path, python_actors=actors) + + assert len(ls_snapshots(run_dir2, 'comp1')) == 2 # resume, t=2.5 + assert len(ls_snapshots(run_dir2, 'comp2')) == 2 # resume, t=2.5 + assert len(ls_snapshots(run_dir2)) == 2 + + +@pytest.mark.parametrize('scale', [0.1, 0.9, 1.0, 1.1, 1.5]) +def test_snapshot_interact_varstep(tmp_path, scale): + config = f"""ymmsl_version: v0.1 +model: + name: test_snapshot + components: + comp1: component + comp2: component + coupler: checkpointing_temporal_coupler + conduits: + comp1.o_i: coupler.a_in + coupler.a_out: comp1.s + comp2.o_i: coupler.b_in + coupler.b_out: comp2.s +settings: + t0: 0.35 + comp1.dt: 0.1234 + comp2.dt: {0.1234 * scale} + t_max: 3.0 + muscle_remote_log_level: {_LOG_LEVEL} +checkpoints: + simulation_time: + - every: 1.0 + start: 0.75 + stop: 2.0 + - at: + - 2.5""" + actors = {f'comp{i + 1}': component for i in range(2)} + actors['coupler'] = interact_coupling.checkpointing_temporal_coupler + + run_dir1 = RunDir(tmp_path / 'run1') + run_manager_with_actors(config, run_dir1.path, python_actors=actors) + + assert len(ls_snapshots(run_dir1, 'comp1')) == 3 # t=0.75, 1.75, 2.5 + assert len(ls_snapshots(run_dir1, 'comp2')) == 3 # t=0.75, 1.75, 2.5 + + snapshots_ymmsl = ls_snapshots(run_dir1) + snapshot_docs = list(map(load, snapshots_ymmsl)) + assert len(snapshot_docs) == 3 + + # resume from the snapshots taken at t>=1.75 + run_dir2 = RunDir(tmp_path / 'run2') + config_doc = load(config) + config_doc.update(snapshot_docs[1]) + + run_manager_with_actors( + dump(config_doc), run_dir2.path, python_actors=actors) + + assert len(ls_snapshots(run_dir2, 'comp1')) == 2 # resume, t=2.5 + assert len(ls_snapshots(run_dir2, 'comp2')) == 2 # resume, t=2.5 + assert len(ls_snapshots(run_dir2)) == 2 From 1b24bdddbacca7d4e1be00c1fc18ba7449dae2b7 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Tue, 29 Nov 2022 16:26:35 +0100 Subject: [PATCH 103/183] Fix message=None cases in cmdline tool --- muscle3/muscle3.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/muscle3/muscle3.py b/muscle3/muscle3.py index 484e4335..f715d5fd 100644 --- a/muscle3/muscle3.py +++ b/muscle3/muscle3.py @@ -133,7 +133,8 @@ def snapshot( typ = 'Final' if snapshot.is_final_snapshot else 'Intermediate' properties = OrderedDict([ ('Snapshot type', typ), - ('Snapshot timestamp', snapshot.message.timestamp), + ('Snapshot timestamp', + snapshot.message.timestamp if snapshot.message else float('-inf')), ('Snapshot wallclock time', snapshot.wallclock_time), ('Snapshot triggers', snapshot.triggers), ]) @@ -146,7 +147,10 @@ def snapshot( click.echo(prop_value) if data: click.secho('Snapshot data:', bold=True) - click.echo(snapshot.message.data) + if snapshot.message is not None: + click.echo(snapshot.message.data) + else: + click.secho("No data available", italic=True) click.echo() From 0106b2193e8431271051bcb61d4f6df6cc657ee4 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Tue, 29 Nov 2022 16:27:25 +0100 Subject: [PATCH 104/183] Add complex coupling checkpointing test Checkpointing based on wallclock_time. --- .../test_snapshot_complex_coupling.py | 185 ++++++++++++++++++ 1 file changed, 185 insertions(+) create mode 100644 integration_test/test_snapshot_complex_coupling.py diff --git a/integration_test/test_snapshot_complex_coupling.py b/integration_test/test_snapshot_complex_coupling.py new file mode 100644 index 00000000..84ffc4af --- /dev/null +++ b/integration_test/test_snapshot_complex_coupling.py @@ -0,0 +1,185 @@ +import random +import time + +import pytest +from ymmsl import ImplementationState, Operator, load, dump + +from libmuscle import Instance, Message +from libmuscle.manager.run_dir import RunDir + +from .conftest import run_manager_with_actors, ls_snapshots + + +_LOG_LEVEL = 'INFO' # set to DEBUG for additional debug info + + +def cache_component(max_channels=2): + ports = {Operator.F_INIT: [f'in{i+1}' for i in range(max_channels)], + Operator.O_I: [f'sub_out{i+1}' for i in range(max_channels)], + Operator.S: [f'sub_in{i+1}' for i in range(max_channels)], + Operator.O_F: [f'out{i+1}' for i in range(max_channels)]} + instance = Instance(ports) + + cache_t = float('-inf') + cache_data = [] + max_cache_age = None + while instance.reuse_instance(): + cache_valid_range = instance.get_setting('cache_valid', '[float]') + if max_cache_age is None: + max_cache_age = random.uniform(*cache_valid_range) + + msgs = [instance.receive(port) if instance.is_connected(port) else None + for port in ports[Operator.F_INIT]] + cur_t = msgs[0].timestamp + + if cur_t - cache_t >= max_cache_age: + # Cached value is no longer valid, run submodel for updated data + for msg, port in zip(msgs, ports[Operator.O_I]): + if msg is not None: + instance.send(port, Message(cur_t, None, msg.data)) + cache_data = [instance.receive(port).data + if instance.is_connected(port) else None + for port in ports[Operator.S]] + cache_t = cur_t + max_cache_age = random.uniform(*cache_valid_range) + + for data, port in zip(cache_data, ports[Operator.O_F]): + if data is not None: + instance.send(port, Message(cur_t, None, data)) + + if instance.should_save_final_snapshot(): + instance.save_final_snapshot(Message(cur_t, None, [])) + + +def echo_component(max_channels=2): + ports = {Operator.F_INIT: [f'in{i+1}' for i in range(max_channels)], + Operator.O_F: [f'out{i+1}' for i in range(max_channels)]} + instance = Instance(ports, stateful=ImplementationState.STATELESS) + + while instance.reuse_instance(): + for p_in, p_out in zip(ports[Operator.F_INIT], ports[Operator.O_F]): + if instance.is_connected(p_in): + instance.send(p_out, instance.receive(p_in)) + + +def main_component(): + instance = Instance({ + Operator.O_I: ['state_out'], + Operator.S: ['Ai', 'Bi', 'Ci', 'Di'], + Operator.O_F: ['o_f']}) + + while instance.reuse_instance(): + dt = instance.get_setting('dt', 'float') + t_max = instance.get_setting('t_max', 'float') + + if instance.resuming(): + msg = instance.load_snapshot() + t_cur = msg.timestamp + i, t_remaining = msg.data + monotonic_end = time.monotonic() + t_remaining + + if instance.should_init(): + t_cur = 0 + monotonic_end = time.monotonic() + t_max + i = 0 + + while time.monotonic() < monotonic_end: + instance.send('state_out', Message(t_cur, None, i)) + for port in ('Ai', 'Bi', 'Ci', 'Di'): + instance.receive(port) + + t_cur += dt + i += 1 + time.sleep(0.05) + + if instance.should_save_snapshot(t_cur): + instance.save_snapshot(Message( + t_cur, None, [i, monotonic_end - time.monotonic()])) + + instance.send('o_f', Message(t_cur, None, i)) + + if instance.should_save_final_snapshot(): + instance.save_final_snapshot(Message(t_cur, None, [i, 0])) + + +@pytest.fixture +def config(): + return load(f"""ymmsl_version: v0.1 +model: + name: test_snapshot + components: + main: main_component + cacheA: cache_component + cacheB: cache_component + cacheC: cache_component + calcA: echo_component + calcB: echo_component + calcC: echo_component + calcD: echo_component + conduits: + main.state_out: + - cacheA.in1 + - cacheB.in1 + - cacheC.in1 + - calcD.in1 + + cacheA.out1: main.Ai + cacheA.out2: main.Bi + cacheA.sub_out1: calcA.in1 + cacheA.sub_out2: calcA.in2 + calcA.out1: cacheA.sub_in1 + calcA.out2: cacheA.sub_in2 + + cacheB.out1: + - cacheA.in2 + - cacheC.in2 + cacheB.sub_out1: calcB.in1 + calcB.out1: cacheB.sub_in1 + + cacheC.out1: main.Ci + cacheC.sub_out1: calcC.in1 + cacheC.sub_out2: calcC.in2 + calcC.out1: cacheC.sub_in1 + + calcD.out1: main.Di + +settings: + dt: 1.234 + t_max: 2.0 # seconds + cacheA.cache_valid: [2.0, 5.0] + cacheB.cache_valid: [3.0, 8.0] + cacheC.cache_valid: [4.0, 10.0] + muscle_remote_log_level: {_LOG_LEVEL} +checkpoints: + at_end: true + wallclock_time: + - every: 0.5""") + + +def test_snapshot_complex_coupling(tmp_path, config): + actors = {'main': main_component} + for c in 'ABC': + actors['cache' + c] = cache_component + for c in 'ABCD': + actors['calc' + c] = echo_component + + run_dir1 = RunDir(tmp_path / 'run1') + run_manager_with_actors( + dump(config), run_dir1.path, python_actors=actors) + + assert len(ls_snapshots(run_dir1, 'main')) == 5 # 2.0/0.5, at_end + assert len(ls_snapshots(run_dir1, 'cacheA')) == 5 # 2.0/0.5, at_end + assert len(ls_snapshots(run_dir1, 'cacheB')) == 5 # 2.0/0.5, at_end + assert len(ls_snapshots(run_dir1, 'cacheC')) == 5 # 2.0/0.5, at_end + # Due to caches, calcA/B/C may not run every 0.5 seconds + assert 1 <= len(ls_snapshots(run_dir1, 'calcA')) <= 5 + assert 1 <= len(ls_snapshots(run_dir1, 'calcB')) <= 5 + assert 1 <= len(ls_snapshots(run_dir1, 'calcC')) <= 5 + assert len(ls_snapshots(run_dir1, 'calcD')) == 5 # 2.0/0.5, at_end + + snapshots_ymmsl = ls_snapshots(run_dir1) + snapshot_docs = list(map(load, snapshots_ymmsl)) + # Snapshots based on wallclock time are less reliable. There is at least one + # resume yMMSL: the at_end collection. At most 4 more, one for each + # wallclock_time checkpoint. + assert 1 <= len(snapshot_docs) <= 5 From adb3b3ab12b0fdf5f1014e1aa6a59fb7410d84b0 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Wed, 30 Nov 2022 13:34:30 +0100 Subject: [PATCH 105/183] Remove outdated comments --- libmuscle/python/libmuscle/communicator.py | 3 --- libmuscle/python/libmuscle/manager/snapshot_registry.py | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/libmuscle/python/libmuscle/communicator.py b/libmuscle/python/libmuscle/communicator.py index 5c68ca4d..bf1cf33e 100644 --- a/libmuscle/python/libmuscle/communicator.py +++ b/libmuscle/python/libmuscle/communicator.py @@ -324,8 +324,6 @@ def receive_message(self, port_name: str, slot: Optional[int] = None, profile_event.message_size = len(mcp_message_bytes) expected_message_number = port.get_num_messages(slot) - # TODO: handle f_init port counts for STATELESS and WEAKLY_STATEFUL - # components which didn't load a snapshot if expected_message_number != mcp_message.message_number: if (expected_message_number - 1 == mcp_message.message_number and port.is_resuming(slot)): @@ -388,7 +386,6 @@ def restore_message_counts(self, port_message_counts: Dict[str, List[int]] raise RuntimeError(f'Unknown port {port_name} in snapshot.' ' Have your port definitions changed since' ' the snapshot was taken?') - # TODO decide if we should check whether all ports are covered def get_message_counts(self) -> Dict[str, List[int]]: """Get message counts for all ports on the communicator diff --git a/libmuscle/python/libmuscle/manager/snapshot_registry.py b/libmuscle/python/libmuscle/manager/snapshot_registry.py index dcf7c3e8..3883ea9a 100644 --- a/libmuscle/python/libmuscle/manager/snapshot_registry.py +++ b/libmuscle/python/libmuscle/manager/snapshot_registry.py @@ -161,7 +161,7 @@ def do_consistency_check( else: consistent = calc_consistency_list( i_msg_counts, p_msg_counts, is_sending, peer_is_restart) - if not consistent: # not consistent + if not consistent: return False self.consistent_peers.setdefault( peer_node.instance, []).append(peer_node) From 876e6267b354320707b3e33dd85487c660d37435 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Thu, 1 Dec 2022 09:43:52 +0100 Subject: [PATCH 106/183] Remove Python 3.6 support --- .github/workflows/ci.yaml | 4 ++-- .../workflows/ci_python_compatibility.yaml | 7 +------ .github/workflows/ci_ubuntu18.04.yaml | 19 ------------------- .github/workflows/ci_ubuntu18.04_clang.yaml | 19 ------------------- docs/source/examples/python/requirements.txt | 1 - docs/source/installing.rst.in | 2 +- setup.py | 4 +--- tox.ini | 3 +-- 8 files changed, 6 insertions(+), 53 deletions(-) delete mode 100644 .github/workflows/ci_ubuntu18.04.yaml delete mode 100644 .github/workflows/ci_ubuntu18.04_clang.yaml diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 351ede6c..16b81e96 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -8,10 +8,10 @@ jobs: steps: - uses: actions/checkout@v2 - - name: Set up Python 3.6 + - name: Set up Python 3.8 uses: actions/setup-python@v1 with: - python-version: 3.6 + python-version: 3.8 - name: Install dependencies run: | diff --git a/.github/workflows/ci_python_compatibility.yaml b/.github/workflows/ci_python_compatibility.yaml index bdc2e86b..89bc9126 100644 --- a/.github/workflows/ci_python_compatibility.yaml +++ b/.github/workflows/ci_python_compatibility.yaml @@ -6,7 +6,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.6", "3.7", "3.8", "3.9", "3.10"] + python-version: ["3.7", "3.8", "3.9", "3.10"] steps: - name: Check out the source code @@ -30,8 +30,3 @@ jobs: - name: Run the test suite run: make test_python_only - - - name: Upload coverage report to Codacy - uses: codacy/codacy-coverage-reporter-action@master - with: - project-token: ${{ secrets.CODACY_PROJECT_TOKEN }} diff --git a/.github/workflows/ci_ubuntu18.04.yaml b/.github/workflows/ci_ubuntu18.04.yaml deleted file mode 100644 index 9e0448ac..00000000 --- a/.github/workflows/ci_ubuntu18.04.yaml +++ /dev/null @@ -1,19 +0,0 @@ -# Run Continuous Integration for the latest Ubuntu release -# This mainly checks for issues/regressions in the native build -name: native_compatibility_ubuntu18.04 -on: - schedule: - - cron: '0 2 * * 0' - push: - branches: - - 'release-*' - - fix_native_compatibility_ci -jobs: - build: - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v2 - - - name: Run tests on Ubuntu 18.04 - run: docker run -v "${GITHUB_WORKSPACE}:/workspace" --env LC_ALL=C.UTF-8 --env LANG=C.UTF-8 --env DEBIAN_FRONTEND=noninteractive ubuntu:18.04 /bin/bash -c 'apt-get update && apt-get -y dist-upgrade && apt-get -y install build-essential cmake gfortran git valgrind libopenmpi-dev pkg-config python3 python3-pip python3-venv curl && apt-get -y remove libssl-dev zlib1g-dev && useradd -m -d /home/muscle3 muscle3 && echo "Added user" && su muscle3 -c -- "cp -r --preserve=mode /workspace /home/muscle3/muscle3" && echo "Copied files" && su muscle3 -c -- "pip3 install --user -U \"pip<22\" setuptools wheel" && su muscle3 -c -- "pip3 install --user \"ymmsl>=0.12.0,<0.13\" qcg-pilotjob==0.13.1" && su muscle3 -c -- "cd /home/muscle3/muscle3 && make test_examples"' diff --git a/.github/workflows/ci_ubuntu18.04_clang.yaml b/.github/workflows/ci_ubuntu18.04_clang.yaml deleted file mode 100644 index 49864bc2..00000000 --- a/.github/workflows/ci_ubuntu18.04_clang.yaml +++ /dev/null @@ -1,19 +0,0 @@ -# Run Continuous Integration for the latest Ubuntu release -# This mainly checks for issues/regressions in the native build -name: native_compatibility_ubuntu18.04_clang -on: - schedule: - - cron: '30 2 * * 0' - push: - branches: - - 'release-*' - - fix_native_compatibility_ci -jobs: - build: - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v2 - - - name: Run tests on Ubuntu 18.04 with Clang - run: docker run -v "${GITHUB_WORKSPACE}:/workspace" --env LC_ALL=C.UTF-8 --env LANG=C.UTF-8 --env DEBIAN_FRONTEND=noninteractive ubuntu:18.04 /bin/bash -c 'apt-get update && apt-get -y dist-upgrade && apt-get -y install build-essential clang cmake gfortran git valgrind libopenmpi-dev pkg-config python3 python3-pip python3-venv curl && apt-get -y remove libssl-dev zlib1g-dev && useradd -m -d /home/muscle3 muscle3 && echo "Added user" && su muscle3 -c -- "cp -r --preserve=mode /workspace /home/muscle3/muscle3" && echo "Copied files" && su muscle3 -c -- "pip3 install --user -U \"pip<22\" setuptools wheel" && su muscle3 -c -- "pip3 install --user \"ymmsl>=0.12.0,<0.13\" qcg-pilotjob==0.13.1" && su muscle3 -c -- "cd /home/muscle3/muscle3 && CXXFLAGS=-fPIE OMPI_CXX=clang++ CXX=clang++ make test_examples"' diff --git a/docs/source/examples/python/requirements.txt b/docs/source/examples/python/requirements.txt index 8e2ef00e..fa14df52 100644 --- a/docs/source/examples/python/requirements.txt +++ b/docs/source/examples/python/requirements.txt @@ -1,5 +1,4 @@ matplotlib>=3,<4 -numpy==1.19.5; python_version=='3.6' numpy<1.22; python_version=='3.7' numpy>=1.22,<=1.25; python_version>='3.8' sobol_seq==0.2.0 diff --git a/docs/source/installing.rst.in b/docs/source/installing.rst.in index cbafedad..25620d03 100644 --- a/docs/source/installing.rst.in +++ b/docs/source/installing.rst.in @@ -12,7 +12,7 @@ Python Installing MUSCLE3 on Python will install all the Python-based components of the system, i.e. the Python version of libmuscle, the YMMSL Python library, and -the MUSCLE Manager. This requires at least Python 3.6. +the MUSCLE Manager. This requires at least Python 3.7. MUSCLE3 is on PyPI as an ordinary Python package, so it can be installed via Pip in the usual way. It's normally a good idea to make a virtual environment diff --git a/setup.py b/setup.py index b99e2d06..467e2595 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,6 @@ 'Development Status :: 4 - Beta', 'License :: OSI Approved :: Apache Software License', 'Operating System :: POSIX :: Linux', - 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', @@ -45,12 +44,11 @@ 'muscle_manager=muscle3.muscle_manager:manage_simulation', 'muscle3=muscle3.muscle3:muscle3'] }, - python_requires='>=3.6, <4', + python_requires='>=3.7, <4', install_requires=[ 'click>=7.1,<9', 'msgpack>=1,<2', 'netifaces==0.11.0', - "numpy==1.19.5; python_version=='3.6'", "numpy<1.22; python_version=='3.7'", "numpy>=1.22,<=1.25; python_version>='3.8'", 'qcg-pilotjob==0.13.1', diff --git a/tox.ini b/tox.ini index 9c6c3968..d556d6ee 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py36, py37, py38, py39, py310 +envlist = py37, py38, py39, py310 skip_missing_interpreters = true [testenv] @@ -22,7 +22,6 @@ commands = [gh-actions] python = - 3.6: py36 3.7: py37 3.8: py38 3.9: py39 From 96e64737914bb428eeec9a81e1af8e01336ff965 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Thu, 1 Dec 2022 10:00:51 +0100 Subject: [PATCH 107/183] Limit flake8 to <6 for now (#137) --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index d556d6ee..6f71615f 100644 --- a/tox.ini +++ b/tox.ini @@ -5,7 +5,7 @@ skip_missing_interpreters = true [testenv] deps = mypy - flake8 + flake8<6 pytest pytest-cov git+https://github.com/multiscale/ymmsl-python.git@feature/multicast#egg=ymmsl From 3a55125d884f4446bb1fe053c0594e50a6c3cf94 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Thu, 1 Dec 2022 13:13:11 +0100 Subject: [PATCH 108/183] Remove python 3.6 support --- setup.py | 1 - tox.ini | 1 - 2 files changed, 2 deletions(-) diff --git a/setup.py b/setup.py index 9b564beb..467e2595 100644 --- a/setup.py +++ b/setup.py @@ -53,7 +53,6 @@ "numpy>=1.22,<=1.25; python_version>='3.8'", 'qcg-pilotjob==0.13.1', 'typing_extensions<4', - "dataclasses; python_version=='3.6'", 'ymmsl>=0.12.0,<0.13' # Also in CI, update there as well ], extras_require={ diff --git a/tox.ini b/tox.ini index 54b84ba9..006e8901 100644 --- a/tox.ini +++ b/tox.ini @@ -9,7 +9,6 @@ deps = pytest pytest-cov git+https://github.com/multiscale/ymmsl-python.git@feature/checkpointing#egg=ymmsl - types-dataclasses; python_version=='3.6' passenv = MUSCLE_TEST_PYTHON_ONLY From 48f72b51e7c067f10b0a0bd86f23803018763053 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Thu, 1 Dec 2022 15:16:03 +0100 Subject: [PATCH 109/183] Add links to the github page in contributing.rst --- docs/source/contributing.rst | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/source/contributing.rst b/docs/source/contributing.rst index c3132e5e..f10fe77b 100644 --- a/docs/source/contributing.rst +++ b/docs/source/contributing.rst @@ -22,7 +22,7 @@ proceed as below. Make an Issue ============= -Issues are found in a tab at the top of the repository home page. Please check +`Issues`_ are found in a tab at the top of `the repository home page`_. Please check to see that the bug you want to fix or the feature you want to add does not already have an issue dedicated to it. If it does, feel free to add to the discussion. If not, please make a new issue. @@ -48,6 +48,9 @@ describe If you want to fix the bug or implement the feature yourself, you'll have to set up a development environment. +.. _Issues: https://github.com/multiscale/muscle3/issues +.. _the repository home page: https://github.com/multiscale/muscle3/ + Get a local repository ====================== From 52149ca8976f60f356055713889145f7efaecdb6 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Fri, 2 Dec 2022 10:45:49 +0100 Subject: [PATCH 110/183] Use sphinx-click to generate cmdline tool docs --- docs/requirements.txt | 1 + docs/source/command_line_tools.rst | 12 ++++++++++++ docs/source/conf.py | 3 ++- docs/source/index.rst | 1 + tox.ini | 1 + 5 files changed, 17 insertions(+), 1 deletion(-) create mode 100644 docs/source/command_line_tools.rst diff --git a/docs/requirements.txt b/docs/requirements.txt index 98b8214f..5d61eae3 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -7,5 +7,6 @@ qcg-pilotjob six sphinx-fortran sphinx-tabs +sphinx-click typing==3.6.6 ymmsl diff --git a/docs/source/command_line_tools.rst b/docs/source/command_line_tools.rst new file mode 100644 index 00000000..e98e7552 --- /dev/null +++ b/docs/source/command_line_tools.rst @@ -0,0 +1,12 @@ +Command line tools +================== + +.. click:: muscle3.muscle_manager:manage_simulation + :prog: muscle_manager + :nested: full + + +.. click:: muscle3.muscle3:muscle3 + :prog: muscle3 + :nested: full + diff --git a/docs/source/conf.py b/docs/source/conf.py index 7464d1c0..56761f46 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -51,7 +51,8 @@ 'sphinx.ext.todo', 'sphinx.ext.viewcode', 'sphinxfortran.fortran_domain', - 'sphinx_tabs.tabs'] + 'sphinx_tabs.tabs', + 'sphinx_click'] # Add any paths that contain templates here, relative to this directory. # templates_path = ['_templates'] diff --git a/docs/source/index.rst b/docs/source/index.rst index ed55ba48..9b1a093b 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -41,6 +41,7 @@ Cham. ``_ python_api cpp_api fortran_api + command_line_tools contributing devtools diff --git a/tox.ini b/tox.ini index 6f71615f..045f7e5b 100644 --- a/tox.ini +++ b/tox.ini @@ -41,5 +41,6 @@ deps = sphinx-fortran sphinx-tabs sphinx_rtd_theme + sphinx-click commands = sphinx-build docs/source docs/build -bhtml From bd1386c1d34945968f218cabe2f347d8178e421a Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Mon, 5 Dec 2022 14:13:02 +0100 Subject: [PATCH 111/183] Fix should_save_final_snapshot when not reusing ClosePort messages have `inf` timestamps, so would always trigger a final snapshot. Only expected when `at_end` checkpoints should be taken. --- libmuscle/python/libmuscle/checkpoint_triggers.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/libmuscle/python/libmuscle/checkpoint_triggers.py b/libmuscle/python/libmuscle/checkpoint_triggers.py index a4edf3be..4299977a 100644 --- a/libmuscle/python/libmuscle/checkpoint_triggers.py +++ b/libmuscle/python/libmuscle/checkpoint_triggers.py @@ -256,9 +256,10 @@ def should_save_final_snapshot( self.__check_should_have_saved() value = False - if not do_reuse and self._checkpoint_at_end: - value = True - self._last_triggers.append('at_end') + if not do_reuse: + if self._checkpoint_at_end: + value = True + self._last_triggers.append('at_end') elif f_init_max_timestamp is None: # No F_INIT messages received: reuse triggered on muscle_settings_in # message. From 91d2aaad82014d38d1a385d516dc1f300f062d16 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Mon, 2 Jan 2023 15:03:30 +0100 Subject: [PATCH 112/183] Fix leftover value from previous protocol design --- libmuscle/python/libmuscle/test/test_mmp_client.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/libmuscle/python/libmuscle/test/test_mmp_client.py b/libmuscle/python/libmuscle/test/test_mmp_client.py index a47311a6..d5051962 100644 --- a/libmuscle/python/libmuscle/test/test_mmp_client.py +++ b/libmuscle/python/libmuscle/test/test_mmp_client.py @@ -1,4 +1,3 @@ -from datetime import datetime, timezone from unittest.mock import patch import msgpack @@ -74,10 +73,7 @@ def test_get_settings(mocked_mmp_client) -> None: def test_register_instance(mocked_mmp_client) -> None: client, stub = mocked_mmp_client - result = [ResponseType.SUCCESS.value, - (datetime.now(timezone.utc).timestamp(), - {'wallclock_time': [], 'simulation_time': []}, - None)] + result = [ResponseType.SUCCESS.value] stub.call.return_value = msgpack.packb(result, use_bin_type=True) client.register_instance( From b58567b1875a0b10baac88866822081747e1a490 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Mon, 2 Jan 2023 15:06:05 +0100 Subject: [PATCH 113/183] Fix a few typos --- libmuscle/cpp/src/libmuscle/communicator.cpp | 2 +- libmuscle/python/libmuscle/instance.py | 2 +- libmuscle/python/libmuscle/manager/mmp_server.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/libmuscle/cpp/src/libmuscle/communicator.cpp b/libmuscle/cpp/src/libmuscle/communicator.cpp index 25526e4b..269a1139 100644 --- a/libmuscle/cpp/src/libmuscle/communicator.cpp +++ b/libmuscle/cpp/src/libmuscle/communicator.cpp @@ -219,7 +219,7 @@ Message Communicator::receive_message( if (slot.is_set()) logger_.debug("Discarding received message on ", port_name, "[", slot.get(), "]: resuming from weakly", - " constistent snapshot"); + " consistent snapshot"); else logger_.debug("Discarding received message on ", port_name, ": resuming from weakly constistent snapshot"); diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py index a0984fa2..245775e8 100644 --- a/libmuscle/python/libmuscle/instance.py +++ b/libmuscle/python/libmuscle/instance.py @@ -661,7 +661,7 @@ def __check_reuse_instance(self, apply_overlay: bool) -> bool: """Pre-receive F_INIT messages and detect if this instance is reused. This is called during :meth:`should_save_final_snapshot` to detect if a - snapshot must be taken. If an instance does implement checkpointing, + snapshot must be taken. If an instance doesn't implement checkpointing, :meth:`reuse_instance` will call it instead. """ do_reuse = self.__receive_settings() diff --git a/libmuscle/python/libmuscle/manager/mmp_server.py b/libmuscle/python/libmuscle/manager/mmp_server.py index 90617fae..f689e6f5 100644 --- a/libmuscle/python/libmuscle/manager/mmp_server.py +++ b/libmuscle/python/libmuscle/manager/mmp_server.py @@ -288,7 +288,7 @@ def _get_checkpoint_info(self, instance_id: str) -> Any: status (ResponseType): SUCCESS wallclock_reference_time (float): Unix timestamp (in UTC) indicating wallclock time of the start of the workflow. - checkpoints (dict): Dictionary encdoing a ymmsl.Checkpoints object. + checkpoints (dict): Dictionary encoding a ymmsl.Checkpoints object. resume_path (Optional[str]): Checkpoint filename to resume from. snapshot_directory (Optional[str]): Directory to store instance snapshots. From 32b24930c8e52cd8aeccfcece8d7102ffac59705 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Mon, 2 Jan 2023 15:06:35 +0100 Subject: [PATCH 114/183] Improve comments to not duplicate typo annotations --- libmuscle/python/libmuscle/mmp_client.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/libmuscle/python/libmuscle/mmp_client.py b/libmuscle/python/libmuscle/mmp_client.py index 188814ff..14e83e9a 100644 --- a/libmuscle/python/libmuscle/mmp_client.py +++ b/libmuscle/python/libmuscle/mmp_client.py @@ -74,15 +74,15 @@ def decode_checkpoint_info( Args: reference_timestamp: seconds since UNIX epoch in UTC timezone to use as wallclock_time = 0 - checkpoints_dict: dictionary of checkpoint definitions - resume: optional string indicating resume path - snapshot_dir: optional string indicating path to store snapshots in + checkpoints_dict: checkpoint definitions from the MsgPack + resume: path to the snapshot we should resume from, if any + snapshot_dir: path to the directory to store new snapshots in Returns: wallclock_time_reference: UTC time where wallclock_time = 0 checkpoints: checkpoint configuration resume: path to the resume snapshot - snapshot_dir: optional path to store snapshots in + snapshot_dir: path to store the snapshots in """ ref_time = datetime.fromtimestamp(reference_timestamp, tz=timezone.utc) checkpoints = Checkpoints( From 0409516574975dce2eac2641e868c9439986954a Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Mon, 2 Jan 2023 15:07:37 +0100 Subject: [PATCH 115/183] Use default message instead of checking for connection --- integration_test/test_snapshot_complex_coupling.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/integration_test/test_snapshot_complex_coupling.py b/integration_test/test_snapshot_complex_coupling.py index 84ffc4af..79b112d4 100644 --- a/integration_test/test_snapshot_complex_coupling.py +++ b/integration_test/test_snapshot_complex_coupling.py @@ -23,29 +23,28 @@ def cache_component(max_channels=2): cache_t = float('-inf') cache_data = [] max_cache_age = None + nil_msg = Message(0.0, None, None) + while instance.reuse_instance(): cache_valid_range = instance.get_setting('cache_valid', '[float]') if max_cache_age is None: max_cache_age = random.uniform(*cache_valid_range) - msgs = [instance.receive(port) if instance.is_connected(port) else None + msgs = [instance.receive(port, default=nil_msg) for port in ports[Operator.F_INIT]] cur_t = msgs[0].timestamp if cur_t - cache_t >= max_cache_age: # Cached value is no longer valid, run submodel for updated data for msg, port in zip(msgs, ports[Operator.O_I]): - if msg is not None: - instance.send(port, Message(cur_t, None, msg.data)) - cache_data = [instance.receive(port).data - if instance.is_connected(port) else None + instance.send(port, Message(cur_t, None, msg.data)) + cache_data = [instance.receive(port, default=nil_msg).data for port in ports[Operator.S]] cache_t = cur_t max_cache_age = random.uniform(*cache_valid_range) for data, port in zip(cache_data, ports[Operator.O_F]): - if data is not None: - instance.send(port, Message(cur_t, None, data)) + instance.send(port, Message(cur_t, None, data)) if instance.should_save_final_snapshot(): instance.save_final_snapshot(Message(cur_t, None, [])) From bca4d83fb91d5ef1e14acbfa06b0f0bbf7d5e2f7 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Mon, 2 Jan 2023 15:09:03 +0100 Subject: [PATCH 116/183] Use default values to simplify member functions --- libmuscle/cpp/src/libmuscle/port.cpp | 59 ++++---------------------- libmuscle/cpp/src/libmuscle/port.hpp | 62 ++-------------------------- 2 files changed, 13 insertions(+), 108 deletions(-) diff --git a/libmuscle/cpp/src/libmuscle/port.cpp b/libmuscle/cpp/src/libmuscle/port.cpp index 70db0550..1dd32a44 100644 --- a/libmuscle/cpp/src/libmuscle/port.cpp +++ b/libmuscle/cpp/src/libmuscle/port.cpp @@ -137,66 +137,25 @@ const std::vector & Port::get_message_counts() const { return num_messages_; } -void Port::increment_num_messages() { - num_messages_[0] ++; - set_resumed(); -} - -void Port::increment_num_messages(int slot) { - num_messages_[slot] ++; - set_resumed(slot); -} - void Port::increment_num_messages(Optional slot) { - if(slot.is_set()) - increment_num_messages(slot.get()); - else - increment_num_messages(); -} - -int Port::get_num_messages() const { - return num_messages_[0]; -} - -int Port::get_num_messages(int slot) const { - return num_messages_[slot]; + int s = slot.is_set() ? slot.get() : 0; + num_messages_[s] ++; + set_resumed(s); } int Port::get_num_messages(Optional slot) const { - if(slot.is_set()) - return get_num_messages(slot.get()); - else - return get_num_messages(); -} - -bool Port::is_resuming() const { - return is_resuming_[0]; -} - -bool Port::is_resuming(int slot) const { - return is_resuming_[slot]; + int s = slot.is_set() ? slot.get() : 0; + return num_messages_[s]; } bool Port::is_resuming(Optional slot) const { - if(slot.is_set()) - return is_resuming(slot.get()); - else - return is_resuming(); -} - -void Port::set_resumed() { - is_resuming_[0] = false; -} - -void Port::set_resumed(int slot) { - is_resuming_[slot] = false; + int s = slot.is_set() ? slot.get() : 0; + return is_resuming_[s]; } void Port::set_resumed(Optional slot) { - if(slot.is_set()) - set_resumed(slot.get()); - else - set_resumed(); + int s = slot.is_set() ? slot.get() : 0; + is_resuming_[s] = false; } } } diff --git a/libmuscle/cpp/src/libmuscle/port.hpp b/libmuscle/cpp/src/libmuscle/port.hpp index 18cfb5d9..908fb270 100644 --- a/libmuscle/cpp/src/libmuscle/port.hpp +++ b/libmuscle/cpp/src/libmuscle/port.hpp @@ -118,59 +118,17 @@ class Port : public ::ymmsl::Port { */ const std::vector & get_message_counts() const; - /** Increment amount of messages sent or received. - */ - void increment_num_messages(); - - /** Increment amount of messages sent or received. - * - * Only valid for vector ports. - * - * @param slot The slot that is sent/received on - */ - void increment_num_messages(int slot); - /** Increment amount of messages sent or received. * * @param slot The slot that is sent/received on */ - void increment_num_messages(Optional slot); + void increment_num_messages(Optional slot = {}); /** Get the amount of messages sent or received - */ - int get_num_messages() const; - - /** Get the amount of messages sent or received - * - * Only valid for vector ports. - * - * @param slot The slot that is sent/received on - */ - int get_num_messages(int slot) const; - - /** Get the amount of messages sent or received - * - * @param slot The slot that is sent/received on - */ - int get_num_messages(Optional slot) const; - - /** True when this port has resumed. - * - * After resumption, each port/slot may discard exactly one message. - * is_resuming keeps track of this state. - */ - bool is_resuming() const; - - /** True when this port has resumed. - * - * After resumption, each port/slot may discard exactly one message. - * is_resuming keeps track of this state. - * - * Only valid for vector ports. * * @param slot The slot that is sent/received on */ - bool is_resuming(int slot) const; + int get_num_messages(Optional slot = {}) const; /** True when this port has resumed. * @@ -179,25 +137,13 @@ class Port : public ::ymmsl::Port { * * @param slot The slot that is sent/received on */ - bool is_resuming(Optional slot) const; - - /** Mark that this port has resumed and may no longer discard messages. - */ - void set_resumed(); - - /** Mark that this port has resumed and may no longer discard messages. - * - * Only valid for vector ports. - * - * @param slot The slot that is sent/received on - */ - void set_resumed(int slot); + bool is_resuming(Optional slot = {}) const; /** Mark that this port has resumed and may no longer discard messages. * * @param slot The slot that is sent/received on */ - void set_resumed(Optional slot); + void set_resumed(Optional slot = {}); private: bool is_connected_; From db6d030589f899fd9748994a8f196eb9779eb175 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Tue, 3 Jan 2023 15:58:56 +0100 Subject: [PATCH 117/183] Factor out API checking --- .../test_snapshot_complex_coupling.py | 16 +- libmuscle/python/libmuscle/api_guard.py | 221 ++++++++++++++++++ .../python/libmuscle/checkpoint_triggers.py | 87 +------ libmuscle/python/libmuscle/instance.py | 50 ++-- .../python/libmuscle/snapshot_manager.py | 21 +- libmuscle/python/libmuscle/test/conftest.py | 6 + .../python/libmuscle/test/test_api_guard.py | 154 ++++++++++++ .../test/test_checkpoint_triggers.py | 46 +--- .../libmuscle/test/test_snapshot_manager.py | 12 +- 9 files changed, 447 insertions(+), 166 deletions(-) create mode 100644 libmuscle/python/libmuscle/api_guard.py create mode 100644 libmuscle/python/libmuscle/test/test_api_guard.py diff --git a/integration_test/test_snapshot_complex_coupling.py b/integration_test/test_snapshot_complex_coupling.py index 79b112d4..a75a89ce 100644 --- a/integration_test/test_snapshot_complex_coupling.py +++ b/integration_test/test_snapshot_complex_coupling.py @@ -26,13 +26,17 @@ def cache_component(max_channels=2): nil_msg = Message(0.0, None, None) while instance.reuse_instance(): - cache_valid_range = instance.get_setting('cache_valid', '[float]') - if max_cache_age is None: - max_cache_age = random.uniform(*cache_valid_range) + if instance.resuming(): + instance.load_snapshot() + + if instance.should_init(): + cache_valid_range = instance.get_setting('cache_valid', '[float]') + if max_cache_age is None: + max_cache_age = random.uniform(*cache_valid_range) - msgs = [instance.receive(port, default=nil_msg) - for port in ports[Operator.F_INIT]] - cur_t = msgs[0].timestamp + msgs = [instance.receive(port, default=nil_msg) + for port in ports[Operator.F_INIT]] + cur_t = msgs[0].timestamp if cur_t - cache_t >= max_cache_age: # Cached value is no longer valid, run submodel for updated data diff --git a/libmuscle/python/libmuscle/api_guard.py b/libmuscle/python/libmuscle/api_guard.py new file mode 100644 index 00000000..1f4fbfd3 --- /dev/null +++ b/libmuscle/python/libmuscle/api_guard.py @@ -0,0 +1,221 @@ +from enum import auto, Enum +from typing import Optional + + +class APIPhase(Enum): + """Different phases that the user code traverses. + + These values describe different regions that the model code can be + in for the case where checkpointing is implemented. By tracking + the phase that the model should be in, we can detect incorrect API + usage. + + This does not match the yMMSL operators, as it is more + fine-grained and concerns checkpointing, which is not represented + in the SEL. + + Note that AFTER_REUSE_INSTANCE and BEFORE_RESUMING refer to the + same place in the code. AFTER_REUSE_INSTANCE is used when we + don't know yet if the code has checkpointing support, and so we + don't know whether the next call is to resuming() or to + reuse_instance(). Once a checkpointing function has been called, + we know that we should expect resume() after reuse_instance() and + we use BEFORE_RESUMING accordingly. + """ + BEFORE_REUSE_INSTANCE = auto() + """Before calling reuse_instance""" + + AFTER_REUSE_INSTANCE = auto() + """At the top of the reuse loop""" + + BEFORE_RESUMING = auto() + """Between reuse_instance and resuming""" + + BEFORE_LOAD_SNAPSHOT = auto() + """Between resuming and load_snapshot""" + + BEFORE_SHOULD_INIT = auto() + """After resuming, before should_init""" + + BEFORE_SHOULD_SAVE_SNAPSHOT = auto() + """Between should_init and should_save*""" + + BEFORE_SAVE_SNAPSHOT = auto() + """Between should_save_snapshot and save_snapshot""" + + BEFORE_SAVE_FINAL_SNAPSHOT = auto() + """Between should_save_final_snapshot and save_final_snapshot""" + + AFTER_REUSE_LOOP = auto() + """After the final call to reuse_instance()""" + + +class APIGuard: + """Keeps track of and checks in which phase the model is. + + The verify_* functions are called when the corresponding function + on Instance is called, to check that we're in the right phase. They + raise a RuntimeError if there's a problem. The *_done functions are + called to signal that the corresponding function finished + successfully, and that we are moving on to the next phase. + """ + def __init__(self) -> None: + """Create an APIPhaseTracker. + + This starts the tracker in BEFORE_REUSE_INSTANCE. + """ + self._phase = APIPhase.BEFORE_REUSE_INSTANCE + self._uses_checkpointing = None # type: Optional[bool] + + def uses_checkpointing(self) -> bool: + """Return whether the code is using checkpointing. + + We can only determine that the code doesn't use checkpointing + if there are no checkpointing calls between the first and + second calls to reuse_instance. So this function should only + be called after the second call to verify_reuse_instance, or + it may raise if the code does not use checkpointing. + + Raises: + RuntimeError: if we are at a point where we cannot know + the answer yet. + """ + if self._uses_checkpointing is not None: + return self._uses_checkpointing + raise RuntimeError( + 'The API was implemented incorrectly, please consult the' + ' documentation.') + + def verify_reuse_instance(self) -> None: + """Check reuse_instance()""" + if self._phase == APIPhase.AFTER_REUSE_INSTANCE: + self._uses_checkpointing = False + elif self._phase != APIPhase.BEFORE_REUSE_INSTANCE: + raise RuntimeError() + + def reuse_instance_done(self, reusing: bool) -> None: + """Update phase on successful reuse_instance(). + + Args: + reusing: Whether we are reusing or not. + """ + if not reusing: + self._phase = APIPhase.AFTER_REUSE_LOOP + else: + if self._uses_checkpointing is None: + self._phase = APIPhase.AFTER_REUSE_INSTANCE + elif self._uses_checkpointing: + self._phase = APIPhase.BEFORE_RESUMING + else: + self._phase = APIPhase.BEFORE_REUSE_INSTANCE + + def verify_resuming(self) -> None: + """Check resuming()""" + if self._phase not in ( + APIPhase.BEFORE_RESUMING, APIPhase.AFTER_REUSE_INSTANCE): + raise RuntimeError( + 'Please call resuming() only as the first thing in the' + ' reuse loop.') + + def resuming_done(self, resuming: bool) -> None: + """Update phase on successful resuming(). + + Args: + resuming: Whether we're resuming or not. + """ + self._uses_checkpointing = True + if resuming: + self._phase = APIPhase.BEFORE_LOAD_SNAPSHOT + else: + self._phase = APIPhase.BEFORE_SHOULD_INIT + + def verify_load_snapshot(self) -> None: + """Check load_snapshot()""" + if self._phase != APIPhase.BEFORE_LOAD_SNAPSHOT: + raise RuntimeError( + 'Please check that we are resuming by calling resuming()' + ' before calling load_snapshot()') + + def load_snapshot_done(self) -> None: + """Update phase on successful load_snapshot()""" + self._phase = APIPhase.BEFORE_SHOULD_INIT + + def verify_should_init(self) -> None: + """Check should_init()""" + if self._phase != APIPhase.BEFORE_SHOULD_INIT: + raise RuntimeError( + 'Please check whether to run f_init using should_init()' + ' after resuming, and before trying to save a snapshot.') + + def should_init_done(self) -> None: + """Update phase on successful should_init()""" + self._phase = APIPhase.BEFORE_SHOULD_SAVE_SNAPSHOT + + def verify_should_save_snapshot(self) -> None: + """Check should_save_snapshot()""" + if self._phase != APIPhase.BEFORE_SHOULD_SAVE_SNAPSHOT: + raise RuntimeError( + 'We reached the end of the reuse loop without checking' + ' if a snapshot should be saved. Please add at least' + ' a should_save_final_snapshot and save_final_snapshot.') + + def should_save_snapshot_done(self, should_save: bool) -> None: + """Update phase on successful should_save_snapshot(). + + Args: + should_save: Whether we should save or not. + """ + if should_save: + self._phase = APIPhase.BEFORE_SAVE_SNAPSHOT + + def verify_save_snapshot(self) -> None: + """Check should_save_snapshot()""" + if self._phase != APIPhase.BEFORE_SAVE_SNAPSHOT: + raise RuntimeError() + + def save_snapshot_done(self) -> None: + """Update phase on successful save_snapshot()""" + self._phase = APIPhase.BEFORE_SHOULD_SAVE_SNAPSHOT + + def verify_should_save_final_snapshot(self) -> None: + """Check should_save_final_snapshot().""" + if self._phase != APIPhase.BEFORE_SHOULD_SAVE_SNAPSHOT: + if self._phase in ( + APIPhase.BEFORE_REUSE_INSTANCE, APIPhase.AFTER_REUSE_LOOP): + msg = ( + 'Please only call should_save_final_snapshot inside' + ' the reuse loop.') + elif self._phase == APIPhase.BEFORE_SAVE_FINAL_SNAPSHOT: + msg = ( + 'If should_save_final_snapshot returns True, then you' + ' must call save_final_snapshot immediately.') + elif self._phase == APIPhase.BEFORE_SAVE_SNAPSHOT: + msg = ( + 'If should_save_snapshot returns True, then you must' + ' call save_snapshot first.') + else: + msg = ( + 'Please only call should_save_final_snapshot at the' + ' end of the reuse loop.') + + raise RuntimeError(msg) + + def should_save_final_snapshot_done(self, should_save: bool) -> None: + """Update phase on successful should_save_snapshot(). + + Args: + should_save: Whether we should save or not. + """ + if should_save: + self._phase = APIPhase.BEFORE_SAVE_FINAL_SNAPSHOT + else: + self._phase = APIPhase.BEFORE_REUSE_INSTANCE + + def verify_save_final_snapshot(self) -> None: + """Check should_save_final_snapshot()""" + if self._phase != APIPhase.BEFORE_SAVE_FINAL_SNAPSHOT: + raise RuntimeError() + + def save_final_snapshot_done(self) -> None: + """Updates state on successful save_final_snapshot()""" + self._phase = APIPhase.BEFORE_REUSE_INSTANCE diff --git a/libmuscle/python/libmuscle/checkpoint_triggers.py b/libmuscle/python/libmuscle/checkpoint_triggers.py index a4edf3be..88a561f8 100644 --- a/libmuscle/python/libmuscle/checkpoint_triggers.py +++ b/libmuscle/python/libmuscle/checkpoint_triggers.py @@ -1,7 +1,6 @@ import bisect from datetime import datetime, timezone import logging -import os import time from typing import List, Optional, Union @@ -12,13 +11,6 @@ _logger = logging.getLogger(__name__) -def _checkpoint_error(description: str) -> None: - if "MUSCLE_DISABLE_CHECKPOINT_ERRORS" in os.environ: - _logger.warning(f"Suppressed checkpoint error: {description}") - else: - raise RuntimeError(description) - - class CheckpointTrigger: """Represents a trigger for creating snapshots""" @@ -59,10 +51,7 @@ def __init__(self, at_rules: List[CheckpointAtRule]) -> None: Args: at: list of checkpoint moments """ - self._at = [] - for at_rule in at_rules: - self._at.extend(at_rule.at) - self._at.sort() + self._at = sorted([a for r in at_rules for a in r.at]) def next_checkpoint(self, cur_time: float) -> Optional[float]: if cur_time >= self._at[-1]: @@ -214,14 +203,6 @@ def set_checkpoint_info( self._nextsim = None # type: Optional[float] self._sim_reset = True - self._first_reuse = True - - # These attributes are only used to check if implementations are - # following the guidelines - self._should_have_saved = False - self._should_save_final_called = False - self._saved_final_checkpoint = False - def elapsed_walltime(self) -> float: """Returns elapsed wallclock_time in seconds. """ @@ -238,12 +219,8 @@ def should_save_snapshot(self, timestamp: float) -> bool: if not self._has_checkpoints: return False - self.__check_should_have_saved() - elapsed_walltime = self.elapsed_walltime() - value = self.__should_save(elapsed_walltime, timestamp) - self._should_have_saved = value - return value + return self.__should_save(elapsed_walltime, timestamp) def should_save_final_snapshot( self, do_reuse: bool, f_init_max_timestamp: Optional[float] @@ -253,8 +230,6 @@ def should_save_final_snapshot( if not self._has_checkpoints: return False - self.__check_should_have_saved() - value = False if not do_reuse and self._checkpoint_at_end: value = True @@ -269,56 +244,15 @@ def should_save_final_snapshot( elapsed_walltime = self.elapsed_walltime() value = self.__should_save(elapsed_walltime, f_init_max_timestamp) - self._should_have_saved = value - self._should_save_final_called = True return value - @property - def save_final_snapshot_called(self) -> bool: - """Check if :meth:`save_final_snapshot` was called during this - reuse loop. - """ - return self._saved_final_checkpoint - - def reuse_instance(self) -> None: - """Cleanup between instance reuse - """ - if not self._has_checkpoints: - return - if self._first_reuse: - self._first_reuse = False - else: - if self._should_have_saved: - _checkpoint_error('"should_save_snapshot" or ' - '"should_save_final_snapshot" returned' - ' positive but no snapshot was saved before' - ' exiting the reuse loop.') - if not (self._should_save_final_called or self._saved_final_checkpoint): - _checkpoint_error('You must call "should_save_final" exactly' - ' once in the reuse loop of an instance that' - ' supports checkpointing.') - self._should_save_final_called = False - self._saved_final_checkpoint = False - - def update_checkpoints(self, timestamp: float, final: bool) -> None: + def update_checkpoints(self, timestamp: float) -> None: """Update last and next checkpoint times when a snapshot is made. Args: timestamp: timestamp as reported by the instance (or from incoming - F_INIT messages when final=True). - final: True iff this is coming from a save_final_snapshot call. + F_INIT messages for save_final_snapshot). """ - if not self._has_checkpoints: - _logger.info('Saving a snapshot but no checkpoints requested by the' - ' workflow. Hint: use Instance.should_save_snapshot(),' - ' Instance.should_save_final_snapshot() or' - ' Instance.snapshots_enabled() to test if it is useful' - ' to save a snapshot.') - return - if final and self._saved_final_checkpoint: - raise RuntimeError( - 'You may only save a final snapshot once per reuse loop.') - self._prevwall = self.elapsed_walltime() self._nextwall = self._wall.next_checkpoint(self._prevwall) @@ -328,8 +262,6 @@ def update_checkpoints(self, timestamp: float, final: bool) -> None: # this method is also called during resume, after which we no longer # consider the simulation_time as reset self._sim_reset = False - self._should_have_saved = False - self._saved_final_checkpoint = final def get_triggers(self) -> List[str]: """Get trigger description(s) for the current reason for checkpointing. @@ -338,17 +270,6 @@ def get_triggers(self) -> List[str]: self._last_triggers = [] return triggers - def __check_should_have_saved(self) -> None: - """Check if a snapshot is saved when required.""" - if self._should_have_saved: - _checkpoint_error('"should_save_snapshot" or ' - '"should_save_final_snapshot" returned positive' - ' but no snapshot was saved before the next call' - ' to a should_save_ method.' - ' You must call the corresponding save_snapshot' - ' or save_final_snapshot method when should_save_' - ' returns True.') - def __should_save(self, walltime: float, simulation_time: float) -> bool: """Check if a checkpoint should be taken diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py index 245775e8..b7959b86 100644 --- a/libmuscle/python/libmuscle/instance.py +++ b/libmuscle/python/libmuscle/instance.py @@ -9,6 +9,7 @@ from ymmsl import (Identifier, Operator, SettingValue, Port, Reference, Settings, ImplementationState) +from libmuscle.api_guard import APIGuard from libmuscle.communicator import Communicator, Message from libmuscle.settings_manager import SettingsManager from libmuscle.logging import LogLevel @@ -66,6 +67,9 @@ def __init__(self, ports: Optional[Dict[Operator, List[str]]] = None, self.__set_up_logging() + self._api_guard = APIGuard() + """Checks that the user uses the API correctly.""" + self._profiler = Profiler(self._instance_name(), self.__manager) """Profiler for this instance.""" @@ -143,6 +147,7 @@ def reuse_instance(self, apply_overlay: bool = True) -> bool: :meth:`should_save_final_snapshot` and :meth:`save_final_snapshot`, or the checkpointing tutorial. """ + self._api_guard.verify_reuse_instance() do_reuse = self._do_reuse if do_reuse is None: # should_save_final_snapshot not called, so we need to check_reuse @@ -158,6 +163,7 @@ def reuse_instance(self, apply_overlay: bool = True) -> bool: self._deregister() self.__manager.close() + self._api_guard.reuse_instance_done(do_reuse) return do_reuse def error_shutdown(self, message: str) -> None: @@ -431,7 +437,10 @@ def resuming(self) -> bool: True iff the submodel must resume from a snapshot instead of the usual F_INIT step during this iteration of the reuse loop. """ - return self._snapshot_manager.resuming() + self._api_guard.verify_resuming() + result = self._snapshot_manager.resuming() + self._api_guard.resuming_done(result) + return result def should_init(self) -> bool: """Check if this instance should initialize. @@ -445,7 +454,10 @@ def should_init(self) -> bool: Returns: True if the submodel must execute the F_INIT step, False otherwise. """ - return self._snapshot_manager.should_init() + self._api_guard.verify_should_init() + result = self._snapshot_manager.should_init() + self._api_guard.should_init_done() + return result def load_snapshot(self) -> Message: """Load a snapshot. @@ -459,7 +471,10 @@ def load_snapshot(self) -> Message: Raises: RuntimeError: if not resuming from a snapshot. """ - return self._snapshot_manager.load_snapshot() + self._api_guard.verify_load_snapshot() + result = self._snapshot_manager.load_snapshot() + self._api_guard.load_snapshot_done() + return result def should_save_snapshot(self, timestamp: float) -> bool: """Check if a snapshot should be saved after the S Operator of the @@ -482,7 +497,10 @@ def should_save_snapshot(self, timestamp: float) -> bool: True iff a snapshot should be taken by the submodel according to the checkpoint rules provided in the ymmsl configuration. """ - return self._snapshot_manager.should_save_snapshot(timestamp) + self._api_guard.verify_should_save_snapshot() + result = self._snapshot_manager.should_save_snapshot(timestamp) + self._api_guard.should_save_snapshot_done(result) + return result def save_snapshot(self, message: Message) -> None: """Save a snapshot after the S Operator of the submodel. @@ -508,7 +526,9 @@ def save_snapshot(self, message: Message) -> None: :meth:`should_save_snapshot`. The data attribute can be used to store the internal state of the submodel. """ - return self._snapshot_manager.save_snapshot(message) + self._api_guard.verify_save_snapshot() + self._snapshot_manager.save_snapshot(message) + self._api_guard.save_snapshot_done() def should_save_final_snapshot(self, *, apply_overlay: bool = True) -> bool: """Check if a snapshot should be saved at the end of the reuse loop. @@ -542,13 +562,12 @@ def should_save_final_snapshot(self, *, apply_overlay: bool = True) -> bool: True iff a final snapshot should be taken by the submodel according to the checkpoint rules provided in the ymmsl configuration. """ - if self._do_reuse is not None: - raise RuntimeError( - 'You may not call should_save_final_snapshot more than once' - ' per reuse loop.') + self._api_guard.verify_should_save_final_snapshot() self._do_reuse = self.__check_reuse_instance(apply_overlay) - return self._snapshot_manager.should_save_final_snapshot( + result = self._snapshot_manager.should_save_final_snapshot( self._do_reuse, self.__f_init_max_timestamp) + self._api_guard.should_save_final_snapshot_done(result) + return result def save_final_snapshot(self, message: Message) -> None: """Save a snapshot at the end of the reuse loop. @@ -571,8 +590,10 @@ def save_final_snapshot(self, message: Message) -> None: attribute can be used to store the internal state of the submodel. """ - return self._snapshot_manager.save_final_snapshot( + self._api_guard.verify_save_final_snapshot() + self._snapshot_manager.save_final_snapshot( message, self.__f_init_max_timestamp) + self._api_guard.save_final_snapshot_done() @property def __f_init_max_timestamp(self) -> Optional[float]: @@ -669,7 +690,7 @@ def __check_reuse_instance(self, apply_overlay: bool) -> bool: # TODO: _f_init_cache should be empty here, or the user didn't # receive something that was sent on the last go-around. # At least emit a warning. - if self.should_init() or not self._first_run: + if self._snapshot_manager.should_init() or not self._first_run: # self.should_init() might be False in first should_save_final(), # but self._first_run is already updated by then self.__pre_receive_f_init(apply_overlay) @@ -684,8 +705,9 @@ def __check_reuse_instance(self, apply_overlay: bool) -> bool: no_settings_in = not self._communicator.settings_in_connected() if f_init_not_connected and no_settings_in: - do_reuse = self._first_run and (not self.resuming() or - not self.should_init()) + do_reuse = self._first_run and ( + not self._snapshot_manager.resuming() or + not self._snapshot_manager.should_init()) else: for message in self._f_init_cache.values(): if isinstance(message.data, ClosePort): diff --git a/libmuscle/python/libmuscle/snapshot_manager.py b/libmuscle/python/libmuscle/snapshot_manager.py index 0bd3de83..efd869f9 100644 --- a/libmuscle/python/libmuscle/snapshot_manager.py +++ b/libmuscle/python/libmuscle/snapshot_manager.py @@ -52,6 +52,8 @@ def __init__(self, self._resume_from_snapshot = None # type: Optional[Snapshot] self._next_snapshot_num = 1 + self._should_save_final_called = False + def get_checkpoint_info(self) -> None: """Request checkpoint info from the muscle manager. """ @@ -78,8 +80,8 @@ def _set_checkpoint_info(self, # snapshot.message is None for implicit snapshots self._resume_from_snapshot = snapshot self._trigger_manager.update_checkpoints( - snapshot.message.timestamp, - snapshot.is_final_snapshot) + snapshot.message.timestamp) + self._should_save_final_called = snapshot.is_final_snapshot self._communicator.restore_message_counts( snapshot.port_message_counts) # Store a copy of the snapshot in the current run directory @@ -103,19 +105,19 @@ def reuse_instance(self, # Only create implicit snapshot if not already explicitly done # And not in the first reuse_instance() if (self._stateful is not ImplementationState.STATEFUL and - not self._trigger_manager.save_final_snapshot_called and + not self._should_save_final_called and not self._first_reuse): if self.should_save_final_snapshot(do_reuse, f_init_max_timestamp): # create an empty message object to store self.__save_snapshot(None, True, f_init_max_timestamp) - self._trigger_manager.reuse_instance() - if self._first_reuse: self._first_reuse = False else: self._resume_from_snapshot = None + self._should_save_final_called = False + def snapshots_enabled(self) -> bool: """Check if the current workflow has snapshots enabled. """ @@ -140,10 +142,8 @@ def should_init(self) -> bool: def load_snapshot(self) -> Message: """Get the Message to resume from. """ - if self._resume_from_snapshot is None: - raise RuntimeError('No snapshot to load. Use "instance.resuming()"' - ' to check if a snapshot is available') - return cast(Message, self._resume_from_snapshot.message) + snapshot = cast(Snapshot, self._resume_from_snapshot) + return cast(Message, snapshot.message) def should_save_snapshot(self, timestamp: float) -> bool: """See :meth:`TriggerManager.should_save_snapshot`. @@ -155,6 +155,7 @@ def should_save_final_snapshot( ) -> bool: """See :meth:`TriggerManager.should_save_final_snapshot`. """ + self._should_save_final_called = True return self._trigger_manager.should_save_final_snapshot( do_reuse, f_init_max_timestamp) @@ -211,7 +212,7 @@ def __save_snapshot( # For final snapshots f_init_max_snapshot is the reference time (see # should_save_final_snapshot). timestamp = f_init_max_timestamp - self._trigger_manager.update_checkpoints(timestamp, final) + self._trigger_manager.update_checkpoints(timestamp) @staticmethod def load_snapshot_from_file(snapshot_location: Path) -> Snapshot: diff --git a/libmuscle/python/libmuscle/test/conftest.py b/libmuscle/python/libmuscle/test/conftest.py index a8d0ad72..77422ee9 100644 --- a/libmuscle/python/libmuscle/test/conftest.py +++ b/libmuscle/python/libmuscle/test/conftest.py @@ -3,6 +3,7 @@ from ymmsl import Settings +from libmuscle.api_guard import APIGuard from libmuscle.communicator import Message from libmuscle.mmp_client import MMPClient @@ -21,3 +22,8 @@ def message() -> Message: @pytest.fixture def message2() -> Message: return Message(0.0, None, {'test': 17}, Settings()) + + +@pytest.fixture +def guard() -> APIGuard: + return APIGuard() diff --git a/libmuscle/python/libmuscle/test/test_api_guard.py b/libmuscle/python/libmuscle/test/test_api_guard.py new file mode 100644 index 00000000..f67bde93 --- /dev/null +++ b/libmuscle/python/libmuscle/test/test_api_guard.py @@ -0,0 +1,154 @@ +from typing import Callable, Set + +import pytest + +from libmuscle.api_guard import APIGuard + + +def test_no_checkpointing_support(guard): + for _ in range(3): + guard.verify_reuse_instance() + guard.reuse_instance_done(True) + + assert not guard.uses_checkpointing() + + guard.verify_reuse_instance() + guard.reuse_instance_done(False) + + assert not guard.uses_checkpointing() + + +def test_final_snapshot_only(guard): + for i in range(4): + guard.verify_reuse_instance() + guard.reuse_instance_done(True) + + guard.verify_resuming() + if i == 0: + guard.resuming_done(True) + + guard.verify_load_snapshot() + guard.load_snapshot_done() + else: + guard.resuming_done(False) + + guard.verify_should_init() + guard.should_init_done() + + guard.verify_should_save_final_snapshot() + if i == 2: + guard.should_save_final_snapshot_done(True) + + guard.verify_save_final_snapshot() + guard.save_final_snapshot_done() + else: + guard.should_save_final_snapshot_done(False) + + guard.verify_reuse_instance() + guard.reuse_instance_done(False) + + +def test_full_checkpointing(guard): + for i in range(4): + guard.verify_reuse_instance() + guard.reuse_instance_done(True) + + guard.verify_resuming() + if i == 0: + guard.resuming_done(True) + + guard.verify_load_snapshot() + guard.load_snapshot_done() + else: + guard.resuming_done(False) + + guard.verify_should_init() + guard.should_init_done() + + for j in range(3): + guard.verify_should_save_snapshot() + if j != 2: + guard.should_save_snapshot_done(True) + + guard.verify_save_snapshot() + guard.save_snapshot_done() + else: + guard.should_save_snapshot_done(False) + + guard.verify_should_save_final_snapshot() + if i == 2: + guard.should_save_final_snapshot_done(True) + + guard.verify_save_final_snapshot() + guard.save_final_snapshot_done() + else: + guard.should_save_final_snapshot_done(False) + + guard.verify_reuse_instance() + guard.reuse_instance_done(False) + + +_api_guard_funs = ( + (APIGuard.verify_reuse_instance, ()), + (APIGuard.reuse_instance_done, (True,)), + (APIGuard.verify_resuming, ()), + (APIGuard.resuming_done, (True,)), + (APIGuard.verify_load_snapshot, ()), + (APIGuard.load_snapshot_done, ()), + (APIGuard.verify_should_init, ()), + (APIGuard.should_init_done, ()), + (APIGuard.verify_should_save_snapshot, ()), + (APIGuard.should_save_snapshot_done, (True,)), + (APIGuard.verify_save_snapshot, ()), + (APIGuard.save_snapshot_done, ()), + (APIGuard.verify_should_save_final_snapshot, ()), + (APIGuard.should_save_final_snapshot_done, (True,)), + (APIGuard.verify_save_final_snapshot, ()) +) + + +def run_until_before(guard: APIGuard, excluded: Callable) -> None: + for fun, args in _api_guard_funs: + if fun is excluded: + break + fun(guard, *args) + + +def check_all_raise_except(guard: APIGuard, excluded: Set[Callable]) -> None: + for fun, args in _api_guard_funs: + if fun.__name__.startswith('verify_'): + if fun not in excluded: + with pytest.raises(RuntimeError): + fun(guard, *args) + else: + fun(guard, *args) + + +@pytest.mark.parametrize('fun', [ + APIGuard.verify_load_snapshot, + APIGuard.verify_should_init, APIGuard.verify_save_snapshot, + APIGuard.verify_save_final_snapshot]) +def test_missing_step(guard, fun): + run_until_before(guard, fun) + check_all_raise_except(guard, {fun}) + + +def test_missing_resuming(guard): + run_until_before(guard, APIGuard.verify_resuming) + check_all_raise_except(guard, { + APIGuard.verify_resuming, APIGuard.verify_reuse_instance}) + + +def test_missing_should_save_final(guard): + run_until_before(guard, APIGuard.verify_should_save_final_snapshot) + check_all_raise_except(guard, { + APIGuard.verify_should_save_snapshot, + APIGuard.verify_should_save_final_snapshot}) + + +def test_double_should_save(guard): + run_until_before(guard, APIGuard.verify_should_save_snapshot) + guard.verify_should_save_snapshot() + guard.should_save_snapshot_done(True) + with pytest.raises(RuntimeError): + guard.verify_should_save_snapshot() diff --git a/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py b/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py index 0cbf47b2..8200854f 100644 --- a/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py +++ b/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py @@ -1,5 +1,4 @@ from datetime import datetime, timedelta, timezone -import logging import time import pytest from ymmsl import CheckpointRangeRule, CheckpointAtRule, Checkpoints @@ -156,15 +155,11 @@ def test_trigger_manager(): wallclock_time=[CheckpointAtRule([1e-12])], simulation_time=[CheckpointAtRule([1, 3, 5])])) - trigger_manager.reuse_instance() - assert trigger_manager.should_save_snapshot(0.1) triggers = trigger_manager.get_triggers() assert len(triggers) == 1 assert "wallclock_time" in triggers[0] - with pytest.raises(RuntimeError): # did not call save in between - trigger_manager.should_save_snapshot(0.1) - trigger_manager.update_checkpoints(0.1, False) + trigger_manager.update_checkpoints(0.1) assert not trigger_manager.should_save_snapshot(0.99) @@ -172,46 +167,13 @@ def test_trigger_manager(): triggers = trigger_manager.get_triggers() assert len(triggers) == 1 assert "simulation_time" in triggers[0] - trigger_manager.update_checkpoints(3.2, False) + trigger_manager.update_checkpoints(3.2) assert trigger_manager.should_save_final_snapshot(True, 7.0) - with pytest.raises(RuntimeError): # did not call save in between - trigger_manager.should_save_snapshot(4.0) - with pytest.raises(RuntimeError): # did not call save in between - trigger_manager.should_save_final_snapshot(True, 7.0) assert len(trigger_manager.get_triggers()) > 0 - trigger_manager.update_checkpoints(7.0, True) - - trigger_manager.reuse_instance() + trigger_manager.update_checkpoints(7.0) assert not trigger_manager.should_save_snapshot(7.1) - with pytest.raises(RuntimeError): # no should_save_final called - trigger_manager.reuse_instance() assert trigger_manager.should_save_final_snapshot(False, None) - with pytest.raises(RuntimeError): # not saved - trigger_manager.reuse_instance() - trigger_manager.update_checkpoints(7.1, True) - - trigger_manager.reuse_instance() - - -def test_trigger_manager_warnings(caplog: pytest.LogCaptureFixture, - monkeypatch: pytest.MonkeyPatch): - monkeypatch.setenv("MUSCLE_DISABLE_CHECKPOINT_ERRORS", "1") - - reference = datetime.now(timezone.utc) - trigger_manager = TriggerManager() - trigger_manager.set_checkpoint_info(reference, Checkpoints( - simulation_time=[CheckpointAtRule([1, 3, 5])])) - - trigger_manager.reuse_instance() - - with caplog.at_level(logging.WARN): - n_records = len(caplog.records) - assert trigger_manager.should_save_snapshot(1.5) - assert len(caplog.records) == n_records - - trigger_manager.reuse_instance() # suppressed error - assert len(caplog.records) > n_records - assert "Suppressed checkpoint error" in caplog.records[-1].message + trigger_manager.update_checkpoints(7.1) diff --git a/libmuscle/python/libmuscle/test/test_snapshot_manager.py b/libmuscle/python/libmuscle/test/test_snapshot_manager.py index ffec4744..b4121dac 100644 --- a/libmuscle/python/libmuscle/test/test_snapshot_manager.py +++ b/libmuscle/python/libmuscle/test/test_snapshot_manager.py @@ -1,9 +1,7 @@ from datetime import datetime, timezone -import logging from pathlib import Path from unittest.mock import MagicMock -import pytest from ymmsl import ( Reference, Checkpoints, CheckpointRangeRule, ImplementationState) @@ -12,8 +10,7 @@ from libmuscle.snapshot_manager import SnapshotManager -def test_no_checkpointing(caplog: pytest.LogCaptureFixture, tmp_path: Path - ) -> None: +def test_no_checkpointing(tmp_path: Path) -> None: manager = MagicMock() communicator = MagicMock() communicator.get_message_counts.return_value = {} @@ -31,11 +28,6 @@ def test_no_checkpointing(caplog: pytest.LogCaptureFixture, tmp_path: Path assert not snapshot_manager.should_save_snapshot(5000) assert not snapshot_manager.should_save_final_snapshot(False, None) - with caplog.at_level(logging.INFO, 'libmuscle'): - snapshot_manager.save_snapshot(Message(1.0, None, None)) - assert caplog.records[0].levelname == "INFO" - assert "no checkpoints" in caplog.records[0].message - def test_save_load_snapshot(tmp_path: Path) -> None: manager = MagicMock() @@ -53,8 +45,6 @@ def test_save_load_snapshot(tmp_path: Path) -> None: assert not snapshot_manager.resuming() snapshot_manager.reuse_instance(True, None) - with pytest.raises(RuntimeError): - snapshot_manager.load_snapshot() assert not snapshot_manager.resuming() assert snapshot_manager.should_save_snapshot(0.2) From c16a6cc5b57fb5bef24cb6817a8cc7db29792fe7 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Wed, 4 Jan 2023 10:13:25 +0100 Subject: [PATCH 118/183] Make only Instance decide how to run the reuse loop --- libmuscle/python/libmuscle/instance.py | 156 ++++++++++++------ .../python/libmuscle/snapshot_manager.py | 75 +++------ .../test/test_checkpoint_triggers.py | 10 ++ .../libmuscle/test/test_snapshot_manager.py | 59 +++---- 4 files changed, 163 insertions(+), 137 deletions(-) diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py index b7959b86..247af77d 100644 --- a/libmuscle/python/libmuscle/instance.py +++ b/libmuscle/python/libmuscle/instance.py @@ -84,16 +84,27 @@ def __init__(self, ports: Optional[Dict[Operator, List[str]]] = None, """Settings for this instance.""" self._snapshot_manager = SnapshotManager( - self._instance_name(), self.__manager, self._communicator, - self._stateful) + self._instance_name(), self.__manager, self._communicator) """Keeps track of checkpointing and snapshots""" - self._first_run = True - """Keeps track of whether this is the first reuse run.""" + self._first_run = None # type: Optional[bool] + """Whether this is the first iteration of the reuse loop""" + self._do_reuse = None # type: Optional[bool] - """Caching variable for result from :meth:`__check_reuse_instance`""" + """Whether to enter this iteration of the reuse loop + + This is None during the reuse loop, and set between + should_save_final_snapshot and reuse_instance. + """ + + self._do_resume = False + """Whether to resume on this iteration of the reuse loop""" + + self._do_init = False + """Whether to do f_init on this iteration of the reuse loop""" self._f_init_cache = dict() # type: _FInitCacheType + """Stores pre-received messages for f_init ports""" self._register() self._connect() @@ -148,14 +159,27 @@ def reuse_instance(self, apply_overlay: bool = True) -> bool: :meth:`save_final_snapshot`, or the checkpointing tutorial. """ self._api_guard.verify_reuse_instance() - do_reuse = self._do_reuse - if do_reuse is None: - # should_save_final_snapshot not called, so we need to check_reuse - do_reuse = self.__check_reuse_instance(apply_overlay) - self._do_reuse = None - self._snapshot_manager.reuse_instance( - do_reuse, self.__f_init_max_timestamp) + if self._do_reuse is not None: + # thank you, should_save_final_snapshot, for running this already + do_reuse = self._do_reuse + self._do_reuse = None + else: + do_reuse = self._decide_reuse_instance(apply_overlay) + + # now _first_run, _do_resume and _do_init are also set correctly + + do_implicit_checkpoint = ( + not self._first_run and + not self._api_guard.uses_checkpointing() and + self._stateful is not ImplementationState.STATEFUL) + + if do_implicit_checkpoint: + if self._snapshot_manager.should_save_final_snapshot( + do_reuse, self.__f_init_max_timestamp): + # store a None instead of a Message + self._snapshot_manager.save_implicit_snapshot( + self.__f_init_max_timestamp) if not do_reuse: self.__close_ports() @@ -438,9 +462,8 @@ def resuming(self) -> bool: usual F_INIT step during this iteration of the reuse loop. """ self._api_guard.verify_resuming() - result = self._snapshot_manager.resuming() - self._api_guard.resuming_done(result) - return result + self._api_guard.resuming_done(self._do_resume) + return self._do_resume def should_init(self) -> bool: """Check if this instance should initialize. @@ -455,9 +478,8 @@ def should_init(self) -> bool: True if the submodel must execute the F_INIT step, False otherwise. """ self._api_guard.verify_should_init() - result = self._snapshot_manager.should_init() self._api_guard.should_init_done() - return result + return self._do_init def load_snapshot(self) -> Message: """Load a snapshot. @@ -563,7 +585,7 @@ def should_save_final_snapshot(self, *, apply_overlay: bool = True) -> bool: to the checkpoint rules provided in the ymmsl configuration. """ self._api_guard.verify_should_save_final_snapshot() - self._do_reuse = self.__check_reuse_instance(apply_overlay) + self._do_reuse = self._decide_reuse_instance(apply_overlay) result = self._snapshot_manager.should_save_final_snapshot( self._do_reuse, self.__f_init_max_timestamp) self._api_guard.should_save_final_snapshot_done(result) @@ -678,43 +700,51 @@ def __set_up_logging(self) -> None: self.__manager) logging.getLogger().addHandler(self._mmp_handler) - def __check_reuse_instance(self, apply_overlay: bool) -> bool: - """Pre-receive F_INIT messages and detect if this instance is reused. + def _decide_reuse_instance(self, apply_overlay: bool) -> bool: + """Decide whether and how to reuse the instance. - This is called during :meth:`should_save_final_snapshot` to detect if a - snapshot must be taken. If an instance doesn't implement checkpointing, - :meth:`reuse_instance` will call it instead. + This sets self._first_run, self._do_resume and self._do_init, and + returns whether to reuse one more time. This is the real top of + the reuse loop, and it gets called by reuse_instance and + should_save_final_snapshot. """ - do_reuse = self.__receive_settings() - - # TODO: _f_init_cache should be empty here, or the user didn't - # receive something that was sent on the last go-around. - # At least emit a warning. - if self._snapshot_manager.should_init() or not self._first_run: - # self.should_init() might be False in first should_save_final(), - # but self._first_run is already updated by then - self.__pre_receive_f_init(apply_overlay) - - self._set_local_log_level() - self._set_remote_log_level() + if self._first_run is None: + self._first_run = True + elif self._first_run: + self._first_run = False + + # resume from intermediate + if self._first_run and self._snapshot_manager.resuming_from_intermediate(): + self._do_resume = True + self._do_init = False + return True + + f_init_connected = self._have_f_init_connections() + + # resume from final + if self._first_run and self._snapshot_manager.resuming_from_final(): + if f_init_connected: + got_f_init_messages = self._pre_receive(apply_overlay) + self._do_resume = True + self._do_init = True + return got_f_init_messages + else: + self._do_resume = False # unused + self._do_init = False # unused + return False - ports = self._communicator.list_ports() - f_init_not_connected = all( - [not self.is_connected(port) - for port in ports.get(Operator.F_INIT, [])]) - no_settings_in = not self._communicator.settings_in_connected() + # fresh start or resuming from implicit snapshot + self._do_resume = False - if f_init_not_connected and no_settings_in: - do_reuse = self._first_run and ( - not self._snapshot_manager.resuming() or - not self._snapshot_manager.should_init()) - else: - for message in self._f_init_cache.values(): - if isinstance(message.data, ClosePort): - do_reuse = False - self._first_run = False + # simple straight single run without resuming + if not f_init_connected: + self._do_init = self._first_run + return self._first_run - return do_reuse + # not resuming and f_init connected, run while we get messages + got_f_init_messages = self._pre_receive(apply_overlay) + self._do_init = got_f_init_messages + return got_f_init_messages def __receive_message( self, port_name: str, slot: Optional[int], @@ -844,6 +874,32 @@ def __check_port(self, port_name: str) -> None: self.__shutdown(err_msg) raise RuntimeError(err_msg) + def _have_f_init_connections(self) -> bool: + """Checks whether we have connected F_INIT ports. + + This includes muscle_settings_in, and any user-defined ports. + """ + ports = self._communicator.list_ports() + f_init_connected = any( + [self.is_connected(port) + for port in ports.get(Operator.F_INIT, [])]) + return f_init_connected or self._communicator.settings_in_connected() + + def _pre_receive(self, apply_overlay: bool) -> bool: + """Pre-receives on all ports. + + This includes muscle_settings_in and all user-defined ports. + + Returns: + True iff no ClosePort messages were received. + """ + all_ports_open = self.__receive_settings() + self.__pre_receive_f_init(apply_overlay) + for message in self._f_init_cache.values(): + if isinstance(message.data, ClosePort): + all_ports_open = False + return all_ports_open + def __receive_settings(self) -> bool: """Receives settings on muscle_settings_in. diff --git a/libmuscle/python/libmuscle/snapshot_manager.py b/libmuscle/python/libmuscle/snapshot_manager.py index efd869f9..c42a24c7 100644 --- a/libmuscle/python/libmuscle/snapshot_manager.py +++ b/libmuscle/python/libmuscle/snapshot_manager.py @@ -3,7 +3,7 @@ from pathlib import Path from typing import cast, Optional -from ymmsl import Checkpoints, Reference, Operator, ImplementationState +from ymmsl import Checkpoints, Reference, Operator from libmuscle.checkpoint_triggers import TriggerManager from libmuscle.communicator import Communicator, Message @@ -30,8 +30,7 @@ class SnapshotManager: def __init__(self, instance_id: Reference, manager: MMPClient, - communicator: Communicator, - stateful: ImplementationState) -> None: + communicator: Communicator) -> None: """Create a new snapshot manager Args: @@ -45,15 +44,11 @@ def __init__(self, self._safe_id = str(instance_id).replace("[", "-").replace("]", "") self._communicator = communicator self._manager = manager - self._stateful = stateful - self._first_reuse = True self._trigger_manager = TriggerManager() self._resume_from_snapshot = None # type: Optional[Snapshot] self._next_snapshot_num = 1 - self._should_save_final_called = False - def get_checkpoint_info(self) -> None: """Request checkpoint info from the muscle manager. """ @@ -81,7 +76,6 @@ def _set_checkpoint_info(self, self._resume_from_snapshot = snapshot self._trigger_manager.update_checkpoints( snapshot.message.timestamp) - self._should_save_final_called = snapshot.is_final_snapshot self._communicator.restore_message_counts( snapshot.port_message_counts) # Store a copy of the snapshot in the current run directory @@ -89,54 +83,27 @@ def _set_checkpoint_info(self, metadata = SnapshotMetadata.from_snapshot(snapshot, str(path)) self._manager.submit_snapshot_metadata(self._instance_id, metadata) - def reuse_instance(self, - do_reuse: bool, f_init_max_timestamp: Optional[float] - ) -> None: - """Callback on Instance.reuse_instance - - Args: - snapshot_directory: Path to store this instance's snapshots in. - do_reuse: Used for implicit snapshots of stateless instances. See - :meth:`should_save_final_snapshot`. - f_init_max_timestamp: Used for implicit snapshots of stateless - instances. See :meth:`should_save_final_snapshot`. - """ - # Implicit snapshots for stateless / weakly stateful instances - # Only create implicit snapshot if not already explicitly done - # And not in the first reuse_instance() - if (self._stateful is not ImplementationState.STATEFUL and - not self._should_save_final_called and - not self._first_reuse): - if self.should_save_final_snapshot(do_reuse, f_init_max_timestamp): - # create an empty message object to store - self.__save_snapshot(None, True, f_init_max_timestamp) - - if self._first_reuse: - self._first_reuse = False - else: - self._resume_from_snapshot = None - - self._should_save_final_called = False - def snapshots_enabled(self) -> bool: """Check if the current workflow has snapshots enabled. """ return self._trigger_manager.snapshots_enabled() - def resuming(self) -> bool: - """Check if we are resuming during this reuse iteration. + def resuming_from_intermediate(self) -> bool: + """Check whether we have an intermediate snapshot. + Doesn't say whether we should resume now, just that we were + given an intermediate snapshot to resume from by the manager. """ - return self._resume_from_snapshot is not None - - def should_init(self) -> bool: - """Check if F_INIT should be run in this reuse loop. - - Returns: - True: when not resuming this reuse loop, or when resuming from a - final snapshot. - False: otherwise - """ - return (self._resume_from_snapshot is None or + return ( + self._resume_from_snapshot is not None and + not self._resume_from_snapshot.is_final_snapshot) + + def resuming_from_final(self) -> bool: + """Check whether we have a final snapshot. + Doesn't say whether we should resume now, just that we were + given an intermediate snapshot to resume from by the manager. + """ + return ( + self._resume_from_snapshot is not None and self._resume_from_snapshot.is_final_snapshot) def load_snapshot(self) -> Message: @@ -155,7 +122,6 @@ def should_save_final_snapshot( ) -> bool: """See :meth:`TriggerManager.should_save_final_snapshot`. """ - self._should_save_final_called = True return self._trigger_manager.should_save_final_snapshot( do_reuse, f_init_max_timestamp) @@ -174,6 +140,12 @@ def save_final_snapshot( raise ValueError(_NO_MESSAGE_PROVIDED.format('save_final_snapshot')) self.__save_snapshot(msg, True, f_init_max_timestamp) + def save_implicit_snapshot( + self, f_init_max_timestamp: Optional[float]) -> None: + """Save final snapshot without a message. + """ + self.__save_snapshot(None, True, f_init_max_timestamp) + def __save_snapshot( self, msg: Optional[Message], final: bool, f_init_max_timestamp: Optional[float] = None @@ -183,6 +155,7 @@ def __save_snapshot( Args: msg: Message object representing the snapshot. final: True iff called from save_final_snapshot. + f_init_max_timestamp: Timestamp for final snapshots. """ triggers = self._trigger_manager.get_triggers() wallclock_time = self._trigger_manager.elapsed_walltime() diff --git a/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py b/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py index 8200854f..388e6eca 100644 --- a/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py +++ b/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py @@ -177,3 +177,13 @@ def test_trigger_manager(): assert trigger_manager.should_save_final_snapshot(False, None) trigger_manager.update_checkpoints(7.1) + + +def test_no_checkpointing() -> None: + trigger_manager = TriggerManager() + trigger_manager.set_checkpoint_info( + datetime.now(timezone.utc), Checkpoints()) + + assert not trigger_manager.should_save_snapshot(1) + assert not trigger_manager.should_save_snapshot(5000) + assert not trigger_manager.should_save_final_snapshot(False, None) diff --git a/libmuscle/python/libmuscle/test/test_snapshot_manager.py b/libmuscle/python/libmuscle/test/test_snapshot_manager.py index b4121dac..fa976e14 100644 --- a/libmuscle/python/libmuscle/test/test_snapshot_manager.py +++ b/libmuscle/python/libmuscle/test/test_snapshot_manager.py @@ -2,8 +2,7 @@ from pathlib import Path from unittest.mock import MagicMock -from ymmsl import ( - Reference, Checkpoints, CheckpointRangeRule, ImplementationState) +from ymmsl import Reference, Checkpoints, CheckpointRangeRule from libmuscle.communicator import Message from libmuscle.snapshot import SnapshotMetadata @@ -14,19 +13,13 @@ def test_no_checkpointing(tmp_path: Path) -> None: manager = MagicMock() communicator = MagicMock() communicator.get_message_counts.return_value = {} - snapshot_manager = SnapshotManager( - Reference('test'), manager, communicator, - ImplementationState.STATEFUL) + snapshot_manager = SnapshotManager(Reference('test'), manager, communicator) snapshot_manager._set_checkpoint_info( datetime.now(timezone.utc), Checkpoints(), None, tmp_path) - assert not snapshot_manager.resuming() - snapshot_manager.reuse_instance(True, None) - assert not snapshot_manager.resuming() - assert not snapshot_manager.should_save_snapshot(1) - assert not snapshot_manager.should_save_snapshot(5000) - assert not snapshot_manager.should_save_final_snapshot(False, None) + assert not snapshot_manager.resuming_from_intermediate() + assert not snapshot_manager.resuming_from_final() def test_save_load_snapshot(tmp_path: Path) -> None: @@ -36,18 +29,15 @@ def test_save_load_snapshot(tmp_path: Path) -> None: communicator.get_message_counts.return_value = port_message_counts instance_id = Reference('test[1]') - snapshot_manager = SnapshotManager( - instance_id, manager, communicator, ImplementationState.STATEFUL) + snapshot_manager = SnapshotManager(instance_id, manager, communicator) checkpoints = Checkpoints(simulation_time=[CheckpointRangeRule(every=1)]) snapshot_manager._set_checkpoint_info( datetime.now(timezone.utc), checkpoints, None, tmp_path) - assert not snapshot_manager.resuming() - snapshot_manager.reuse_instance(True, None) - - assert not snapshot_manager.resuming() assert snapshot_manager.should_save_snapshot(0.2) + assert not snapshot_manager.resuming_from_intermediate() + assert not snapshot_manager.resuming_from_final() snapshot_manager.save_snapshot(Message(0.2, None, 'test data')) communicator.get_message_counts.assert_called_with() @@ -65,16 +55,14 @@ def test_save_load_snapshot(tmp_path: Path) -> None: assert snapshot_path.parent == tmp_path assert snapshot_path.name == 'test-1_1.pack' - snapshot_manager2 = SnapshotManager( - instance_id, manager, communicator, ImplementationState.STATEFUL) + snapshot_manager2 = SnapshotManager(instance_id, manager, communicator) snapshot_manager2._set_checkpoint_info( datetime.now(timezone.utc), checkpoints, snapshot_path, tmp_path) communicator.restore_message_counts.assert_called_with(port_message_counts) - assert snapshot_manager2.resuming() - snapshot_manager2.reuse_instance(True, None) - assert snapshot_manager2.resuming() + assert snapshot_manager2.resuming_from_intermediate() + assert not snapshot_manager2.resuming_from_final() msg = snapshot_manager2.load_snapshot() assert msg.timestamp == 0.2 assert msg.next_timestamp is None @@ -98,9 +86,11 @@ def test_save_load_snapshot(tmp_path: Path) -> None: assert snapshot_path.parent == tmp_path assert snapshot_path.name == 'test-1_3.pack' - assert snapshot_manager2.resuming() - snapshot_manager2.reuse_instance(True, None) - assert not snapshot_manager2.resuming() + assert snapshot_manager2.resuming_from_intermediate() + assert not snapshot_manager2.resuming_from_final() + snapshot_manager2.load_snapshot() + assert snapshot_manager2.resuming_from_intermediate() + assert not snapshot_manager2.resuming_from_final() def test_save_load_implicit_snapshot(tmp_path: Path) -> None: @@ -110,16 +100,15 @@ def test_save_load_implicit_snapshot(tmp_path: Path) -> None: communicator.get_message_counts.return_value = port_message_counts instance_id = Reference('test[1]') - snapshot_manager = SnapshotManager( - instance_id, manager, communicator, ImplementationState.STATELESS) + snapshot_manager = SnapshotManager(instance_id, manager, communicator) checkpoints = Checkpoints(simulation_time=[CheckpointRangeRule(every=1)]) snapshot_manager._set_checkpoint_info( datetime.now(timezone.utc), checkpoints, None, tmp_path) - assert not snapshot_manager.resuming() - snapshot_manager.reuse_instance(True, None) - snapshot_manager.reuse_instance(True, 1.5) + assert not snapshot_manager.resuming_from_intermediate() + assert not snapshot_manager.resuming_from_final() + snapshot_manager.save_implicit_snapshot(1.5) manager.submit_snapshot_metadata.assert_called_once() instance, metadata = manager.submit_snapshot_metadata.call_args[0] assert instance == instance_id @@ -127,8 +116,7 @@ def test_save_load_implicit_snapshot(tmp_path: Path) -> None: snapshot_path = Path(metadata.snapshot_filename) manager.submit_snapshot_metadata.reset_mock() - snapshot_manager2 = SnapshotManager( - instance_id, manager, communicator, ImplementationState.STATELESS) + snapshot_manager2 = SnapshotManager(instance_id, manager, communicator) snapshot_manager2._set_checkpoint_info( datetime.now(timezone.utc), checkpoints, snapshot_path, tmp_path) @@ -136,8 +124,7 @@ def test_save_load_implicit_snapshot(tmp_path: Path) -> None: manager.submit_snapshot_metadata.assert_called_once() manager.submit_snapshot_metadata.reset_mock() - assert not snapshot_manager2.resuming() - snapshot_manager2.reuse_instance(True, 1.5) - assert not snapshot_manager2.resuming() - snapshot_manager2.reuse_instance(True, 2.5) + assert not snapshot_manager2.resuming_from_intermediate() + assert not snapshot_manager2.resuming_from_final() + snapshot_manager2.save_implicit_snapshot(2.5) manager.submit_snapshot_metadata.assert_called_once() From 347eb284a4233ec8d85c3ee9ff4d9fb08b5b3e00 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Wed, 4 Jan 2023 18:43:25 +0100 Subject: [PATCH 119/183] Factor TriggerManager out of SnapshotManager --- libmuscle/python/libmuscle/instance.py | 61 +++++++--- .../python/libmuscle/snapshot_manager.py | 108 ++++++------------ .../libmuscle/test/test_snapshot_manager.py | 47 +++----- 3 files changed, 104 insertions(+), 112 deletions(-) diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py index 247af77d..a94ccb76 100644 --- a/libmuscle/python/libmuscle/instance.py +++ b/libmuscle/python/libmuscle/instance.py @@ -10,6 +10,7 @@ Settings, ImplementationState) from libmuscle.api_guard import APIGuard +from libmuscle.checkpoint_triggers import TriggerManager from libmuscle.communicator import Communicator, Message from libmuscle.settings_manager import SettingsManager from libmuscle.logging import LogLevel @@ -85,7 +86,10 @@ def __init__(self, ports: Optional[Dict[Operator, List[str]]] = None, self._snapshot_manager = SnapshotManager( self._instance_name(), self.__manager, self._communicator) - """Keeps track of checkpointing and snapshots""" + """Resumes, loads and saves snapshots.""" + + self._trigger_manager = TriggerManager() + """Keeps track of checkpoints and triggers snapshots.""" self._first_run = None # type: Optional[bool] """Whether this is the first iteration of the reuse loop""" @@ -108,9 +112,22 @@ def __init__(self, ports: Optional[Dict[Operator, List[str]]] = None, self._register() self._connect() - # Note: SnapshotManager.get_checkpoint_info needs to have the ports - # initialized so it comes after self._connect() - self._snapshot_manager.get_checkpoint_info() + + # Note: get_checkpoint_info needs to have the ports initialized + # so it comes after self._connect() + checkpoint_info = self.__manager.get_checkpoint_info( + self._instance_name()) + + utc_reference, checkpoints = checkpoint_info[0:2] + self._trigger_manager.set_checkpoint_info(utc_reference, checkpoints) + + resume_snapshot, snapshot_dir = checkpoint_info[2:4] + saved_at = self._snapshot_manager.prepare_resume( + resume_snapshot, snapshot_dir) + + if saved_at is not None: + self._trigger_manager.update_checkpoints(saved_at) + self._set_local_log_level() self._set_remote_log_level() @@ -175,11 +192,10 @@ def reuse_instance(self, apply_overlay: bool = True) -> bool: self._stateful is not ImplementationState.STATEFUL) if do_implicit_checkpoint: - if self._snapshot_manager.should_save_final_snapshot( + if self._trigger_manager.should_save_final_snapshot( do_reuse, self.__f_init_max_timestamp): # store a None instead of a Message - self._snapshot_manager.save_implicit_snapshot( - self.__f_init_max_timestamp) + self._save_snapshot(None, True, self.__f_init_max_timestamp) if not do_reuse: self.__close_ports() @@ -443,7 +459,7 @@ def snapshots_enabled(self) -> bool: Returns: True iff checkpoint rules are defined in the workflow yMMSL. """ - return self._snapshot_manager.snapshots_enabled() + return self._trigger_manager.snapshots_enabled() def resuming(self) -> bool: """Check if this instance is resuming from a snapshot. @@ -520,7 +536,7 @@ def should_save_snapshot(self, timestamp: float) -> bool: checkpoint rules provided in the ymmsl configuration. """ self._api_guard.verify_should_save_snapshot() - result = self._snapshot_manager.should_save_snapshot(timestamp) + result = self._trigger_manager.should_save_snapshot(timestamp) self._api_guard.should_save_snapshot_done(result) return result @@ -549,7 +565,7 @@ def save_snapshot(self, message: Message) -> None: store the internal state of the submodel. """ self._api_guard.verify_save_snapshot() - self._snapshot_manager.save_snapshot(message) + self._save_snapshot(message, False) self._api_guard.save_snapshot_done() def should_save_final_snapshot(self, *, apply_overlay: bool = True) -> bool: @@ -585,9 +601,11 @@ def should_save_final_snapshot(self, *, apply_overlay: bool = True) -> bool: to the checkpoint rules provided in the ymmsl configuration. """ self._api_guard.verify_should_save_final_snapshot() + self._do_reuse = self._decide_reuse_instance(apply_overlay) - result = self._snapshot_manager.should_save_final_snapshot( + result = self._trigger_manager.should_save_final_snapshot( self._do_reuse, self.__f_init_max_timestamp) + self._api_guard.should_save_final_snapshot_done(result) return result @@ -613,8 +631,7 @@ def save_final_snapshot(self, message: Message) -> None: submodel. """ self._api_guard.verify_save_final_snapshot() - self._snapshot_manager.save_final_snapshot( - message, self.__f_init_max_timestamp) + self._save_snapshot(message, True, self.__f_init_max_timestamp) self._api_guard.save_final_snapshot_done() @property @@ -746,6 +763,24 @@ def _decide_reuse_instance(self, apply_overlay: bool) -> bool: self._do_init = got_f_init_messages return got_f_init_messages + def _save_snapshot( + self, message: Optional[Message], final: bool, + f_init_max_timestamp: Optional[float] = None) -> None: + """Save a snapshot to disk and notify manager. + + Args: + message: The data to save + final: Whether this is a final snapshot or an intermediate + one + f_init_max_timestamp: Timestamp for final snapshots + """ + triggers = self._trigger_manager.get_triggers() + walltime = self._trigger_manager.elapsed_walltime() + timestamp = self._snapshot_manager.save_snapshot( + message, final, triggers, walltime, + f_init_max_timestamp) + self._trigger_manager.update_checkpoints(timestamp) + def __receive_message( self, port_name: str, slot: Optional[int], default: Optional[Message], with_settings: bool diff --git a/libmuscle/python/libmuscle/snapshot_manager.py b/libmuscle/python/libmuscle/snapshot_manager.py index c42a24c7..f756d05d 100644 --- a/libmuscle/python/libmuscle/snapshot_manager.py +++ b/libmuscle/python/libmuscle/snapshot_manager.py @@ -1,11 +1,9 @@ import logging -from datetime import datetime from pathlib import Path -from typing import cast, Optional +from typing import cast, List, Optional -from ymmsl import Checkpoints, Reference, Operator +from ymmsl import Reference, Operator -from libmuscle.checkpoint_triggers import TriggerManager from libmuscle.communicator import Communicator, Message from libmuscle.mmp_client import MMPClient from libmuscle.snapshot import MsgPackSnapshot, Snapshot, SnapshotMetadata @@ -45,37 +43,37 @@ def __init__(self, self._communicator = communicator self._manager = manager - self._trigger_manager = TriggerManager() self._resume_from_snapshot = None # type: Optional[Snapshot] self._next_snapshot_num = 1 - def get_checkpoint_info(self) -> None: - """Request checkpoint info from the muscle manager. - """ - checkpoint_info = self._manager.get_checkpoint_info(self._instance_id) - self._set_checkpoint_info(*checkpoint_info) - - def _set_checkpoint_info(self, - utc_reference: datetime, - checkpoints: Checkpoints, - resume: Optional[Path], - snapshot_directory: Optional[Path]) -> None: + def prepare_resume( + self, resume_snapshot: Optional[Path], + snapshot_directory: Optional[Path]) -> Optional[float]: """Apply checkpoint info received from the manager. + If there is a snapshot to resume from, this loads it and does + any resume work that libmuscle should do, including restoring + message counts and storing the resumed-from snapshot again as + our first snapshot. + Args: - utc_reference: datetime (in UTC) indicating wallclock_time=0 - checkpoints: requested workflow checkpoints - resume: previous snapshot to resume from (or None if not resuming) + resume_snapshot: Snapshot to resume from (or None if not + resuming) + snapshot_directory: directory to save snapshots in + + Returns: + Time at which the initial snapshot was saved, if resuming. """ - self._trigger_manager.set_checkpoint_info(utc_reference, checkpoints) + result = None # type: Optional[float] self._snapshot_directory = snapshot_directory or Path.cwd() - if resume is not None: - snapshot = self.load_snapshot_from_file(resume) + if resume_snapshot is not None: + snapshot = self.load_snapshot_from_file(resume_snapshot) + if snapshot.message is not None: # snapshot.message is None for implicit snapshots self._resume_from_snapshot = snapshot - self._trigger_manager.update_checkpoints( - snapshot.message.timestamp) + result = snapshot.message.timestamp + self._communicator.restore_message_counts( snapshot.port_message_counts) # Store a copy of the snapshot in the current run directory @@ -83,13 +81,11 @@ def _set_checkpoint_info(self, metadata = SnapshotMetadata.from_snapshot(snapshot, str(path)) self._manager.submit_snapshot_metadata(self._instance_id, metadata) - def snapshots_enabled(self) -> bool: - """Check if the current workflow has snapshots enabled. - """ - return self._trigger_manager.snapshots_enabled() + return result def resuming_from_intermediate(self) -> bool: """Check whether we have an intermediate snapshot. + Doesn't say whether we should resume now, just that we were given an intermediate snapshot to resume from by the manager. """ @@ -99,9 +95,10 @@ def resuming_from_intermediate(self) -> bool: def resuming_from_final(self) -> bool: """Check whether we have a final snapshot. + Doesn't say whether we should resume now, just that we were given an intermediate snapshot to resume from by the manager. - """ + """ return ( self._resume_from_snapshot is not None and self._resume_from_snapshot.is_final_snapshot) @@ -112,54 +109,23 @@ def load_snapshot(self) -> Message: snapshot = cast(Snapshot, self._resume_from_snapshot) return cast(Message, snapshot.message) - def should_save_snapshot(self, timestamp: float) -> bool: - """See :meth:`TriggerManager.should_save_snapshot`. - """ - return self._trigger_manager.should_save_snapshot(timestamp) - - def should_save_final_snapshot( - self, do_reuse: bool, f_init_max_timestamp: Optional[float] - ) -> bool: - """See :meth:`TriggerManager.should_save_final_snapshot`. - """ - return self._trigger_manager.should_save_final_snapshot( - do_reuse, f_init_max_timestamp) - - def save_snapshot(self, msg: Message) -> None: - """Save snapshot contained in the message object. - """ - if not isinstance(msg, Message): - raise ValueError(_NO_MESSAGE_PROVIDED.format('save_snapshot')) - self.__save_snapshot(msg, False) - - def save_final_snapshot( - self, msg: Message, f_init_max_timestamp: Optional[float]) -> None: - """Save final snapshot contained in the message object. - """ - if not isinstance(msg, Message): - raise ValueError(_NO_MESSAGE_PROVIDED.format('save_final_snapshot')) - self.__save_snapshot(msg, True, f_init_max_timestamp) - - def save_implicit_snapshot( - self, f_init_max_timestamp: Optional[float]) -> None: - """Save final snapshot without a message. - """ - self.__save_snapshot(None, True, f_init_max_timestamp) - - def __save_snapshot( + def save_snapshot( self, msg: Optional[Message], final: bool, - f_init_max_timestamp: Optional[float] = None - ) -> None: - """Actual implementation used by save_(final_)snapshot. + triggers: List[str], wallclock_time: float, + f_init_max_timestamp: Optional[float] = None, + ) -> float: + """Save a (final) snapshot. Args: msg: Message object representing the snapshot. final: True iff called from save_final_snapshot. + triggers: Description of checkpoints that triggered this. + wallclock_time: Wallclock time when saving. f_init_max_timestamp: Timestamp for final snapshots. - """ - triggers = self._trigger_manager.get_triggers() - wallclock_time = self._trigger_manager.elapsed_walltime() + Returns: + Simulation time at which the snapshot was made + """ port_message_counts = self._communicator.get_message_counts() if final: # Decrease F_INIT port counts by one: F_INIT messages are already @@ -185,7 +151,7 @@ def __save_snapshot( # For final snapshots f_init_max_snapshot is the reference time (see # should_save_final_snapshot). timestamp = f_init_max_timestamp - self._trigger_manager.update_checkpoints(timestamp) + return timestamp @staticmethod def load_snapshot_from_file(snapshot_location: Path) -> Snapshot: diff --git a/libmuscle/python/libmuscle/test/test_snapshot_manager.py b/libmuscle/python/libmuscle/test/test_snapshot_manager.py index fa976e14..31423bb0 100644 --- a/libmuscle/python/libmuscle/test/test_snapshot_manager.py +++ b/libmuscle/python/libmuscle/test/test_snapshot_manager.py @@ -1,8 +1,7 @@ -from datetime import datetime, timezone from pathlib import Path from unittest.mock import MagicMock -from ymmsl import Reference, Checkpoints, CheckpointRangeRule +from ymmsl import Reference from libmuscle.communicator import Message from libmuscle.snapshot import SnapshotMetadata @@ -15,9 +14,7 @@ def test_no_checkpointing(tmp_path: Path) -> None: communicator.get_message_counts.return_value = {} snapshot_manager = SnapshotManager(Reference('test'), manager, communicator) - snapshot_manager._set_checkpoint_info( - datetime.now(timezone.utc), Checkpoints(), None, tmp_path) - + snapshot_manager.prepare_resume(None, tmp_path) assert not snapshot_manager.resuming_from_intermediate() assert not snapshot_manager.resuming_from_final() @@ -31,22 +28,20 @@ def test_save_load_snapshot(tmp_path: Path) -> None: instance_id = Reference('test[1]') snapshot_manager = SnapshotManager(instance_id, manager, communicator) - checkpoints = Checkpoints(simulation_time=[CheckpointRangeRule(every=1)]) - snapshot_manager._set_checkpoint_info( - datetime.now(timezone.utc), checkpoints, None, tmp_path) - - assert snapshot_manager.should_save_snapshot(0.2) + snapshot_manager.prepare_resume(None, tmp_path) assert not snapshot_manager.resuming_from_intermediate() assert not snapshot_manager.resuming_from_final() - snapshot_manager.save_snapshot(Message(0.2, None, 'test data')) + + snapshot_manager.save_snapshot( + Message(0.2, None, 'test data'), False, ['test'], 13.0) communicator.get_message_counts.assert_called_with() manager.submit_snapshot_metadata.assert_called() instance, metadata = manager.submit_snapshot_metadata.call_args[0] assert instance == instance_id assert isinstance(metadata, SnapshotMetadata) - assert metadata.triggers - assert metadata.wallclock_time > 0.0 + assert metadata.triggers == ['test'] + assert metadata.wallclock_time == 13.0 assert metadata.timestamp == 0.2 assert metadata.next_timestamp is None assert metadata.port_message_counts == port_message_counts @@ -57,8 +52,7 @@ def test_save_load_snapshot(tmp_path: Path) -> None: snapshot_manager2 = SnapshotManager(instance_id, manager, communicator) - snapshot_manager2._set_checkpoint_info( - datetime.now(timezone.utc), checkpoints, snapshot_path, tmp_path) + snapshot_manager2.prepare_resume(snapshot_path, tmp_path) communicator.restore_message_counts.assert_called_with(port_message_counts) assert snapshot_manager2.resuming_from_intermediate() @@ -68,16 +62,14 @@ def test_save_load_snapshot(tmp_path: Path) -> None: assert msg.next_timestamp is None assert msg.data == 'test data' - assert not snapshot_manager2.should_save_snapshot(0.4) - assert snapshot_manager2.should_save_final_snapshot(True, 1.2) - snapshot_manager2.save_final_snapshot( - Message(0.6, None, 'test data2'), 1.2) + snapshot_manager2.save_snapshot( + Message(0.6, None, 'test data2'), True, ['test'], 42.2, 1.2) instance, metadata = manager.submit_snapshot_metadata.call_args[0] assert instance == instance_id assert isinstance(metadata, SnapshotMetadata) - assert metadata.triggers - assert metadata.wallclock_time > 0.0 + assert metadata.triggers == ['test'] + assert metadata.wallclock_time == 42.2 assert metadata.timestamp == 0.6 assert metadata.next_timestamp is None assert metadata.port_message_counts == port_message_counts @@ -102,13 +94,13 @@ def test_save_load_implicit_snapshot(tmp_path: Path) -> None: instance_id = Reference('test[1]') snapshot_manager = SnapshotManager(instance_id, manager, communicator) - checkpoints = Checkpoints(simulation_time=[CheckpointRangeRule(every=1)]) - snapshot_manager._set_checkpoint_info( - datetime.now(timezone.utc), checkpoints, None, tmp_path) + snapshot_manager.prepare_resume(None, tmp_path) assert not snapshot_manager.resuming_from_intermediate() assert not snapshot_manager.resuming_from_final() - snapshot_manager.save_implicit_snapshot(1.5) + # save implicit snapshot + snapshot_manager.save_snapshot(None, True, ['implicit'], 1.0, 1.5) + manager.submit_snapshot_metadata.assert_called_once() instance, metadata = manager.submit_snapshot_metadata.call_args[0] assert instance == instance_id @@ -118,13 +110,12 @@ def test_save_load_implicit_snapshot(tmp_path: Path) -> None: snapshot_manager2 = SnapshotManager(instance_id, manager, communicator) - snapshot_manager2._set_checkpoint_info( - datetime.now(timezone.utc), checkpoints, snapshot_path, tmp_path) + snapshot_manager2.prepare_resume(snapshot_path, tmp_path) communicator.restore_message_counts.assert_called_with(port_message_counts) manager.submit_snapshot_metadata.assert_called_once() manager.submit_snapshot_metadata.reset_mock() assert not snapshot_manager2.resuming_from_intermediate() assert not snapshot_manager2.resuming_from_final() - snapshot_manager2.save_implicit_snapshot(2.5) + snapshot_manager2.save_snapshot(None, True, ['implicit'], 12.3, 2.5) manager.submit_snapshot_metadata.assert_called_once() From 57b618fc18c710cefadc10fb00c14c142f1b44a0 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Thu, 5 Jan 2023 19:47:08 +0100 Subject: [PATCH 120/183] Improve dealing with wall-clock time checkpoints --- integration_test/test_cpp_mpp_client.py | 2 +- libmuscle/cpp/src/libmuscle/communicator.cpp | 3 +- libmuscle/cpp/src/libmuscle/mpp_message.cpp | 5 +- libmuscle/cpp/src/libmuscle/mpp_message.hpp | 3 +- .../libmuscle/tests/mocks/mock_mpp_client.cpp | 3 +- .../tests/mocks/mock_post_office.cpp | 4 +- .../tests/tcp_transport_server_test.cpp | 2 +- .../src/libmuscle/tests/test_mpp_message.cpp | 8 ++- .../cpp/src/libmuscle/tests/test_outbox.cpp | 2 +- .../src/libmuscle/tests/test_post_office.cpp | 2 +- .../tests/test_tcp_communication.cpp | 3 +- .../python/libmuscle/checkpoint_triggers.py | 46 ++++++++-------- libmuscle/python/libmuscle/communicator.py | 39 +++++++------ libmuscle/python/libmuscle/instance.py | 19 +++++-- .../python/libmuscle/manager/mmp_server.py | 7 +-- .../manager/test/test_mmp_request_handler.py | 6 +- libmuscle/python/libmuscle/mmp_client.py | 19 +++---- libmuscle/python/libmuscle/mpp_message.py | 11 +++- libmuscle/python/libmuscle/snapshot.py | 2 +- .../test/test_checkpoint_triggers.py | 19 +++---- .../libmuscle/test/test_communicator.py | 55 +++++++++++-------- .../python/libmuscle/test/test_instance.py | 28 +++++----- .../python/libmuscle/test/test_mpp_message.py | 12 ++-- .../python/libmuscle/test/test_outbox.py | 2 +- 24 files changed, 167 insertions(+), 135 deletions(-) diff --git a/integration_test/test_cpp_mpp_client.py b/integration_test/test_cpp_mpp_client.py index 7541993e..976dc41c 100644 --- a/integration_test/test_cpp_mpp_client.py +++ b/integration_test/test_cpp_mpp_client.py @@ -23,7 +23,7 @@ def tcp_server_process(control_pipe): message = MPPMessage( Reference('test_sender.test_port'), receiver, - 10, 1.0, 2.0, settings, 0, data).encoded() + 10, 1.0, 2.0, settings, 0, 1.0, data).encoded() def handle_request(request_bytes): request = msgpack.unpackb(request_bytes, raw=False) diff --git a/libmuscle/cpp/src/libmuscle/communicator.cpp b/libmuscle/cpp/src/libmuscle/communicator.cpp index 269a1139..644d67ae 100644 --- a/libmuscle/cpp/src/libmuscle/communicator.cpp +++ b/libmuscle/cpp/src/libmuscle/communicator.cpp @@ -133,7 +133,8 @@ void Communicator::send_message( MPPMessage mpp_message( snd_endpoint.ref(), recv_endpoint.ref(), port_length, message.timestamp(), Optional(), - settings_overlay, port.get_num_messages(slot), message.data()); + settings_overlay, port.get_num_messages(slot), -1.0, + message.data()); if (message.has_next_timestamp()) mpp_message.next_timestamp = message.next_timestamp(); diff --git a/libmuscle/cpp/src/libmuscle/mpp_message.cpp b/libmuscle/cpp/src/libmuscle/mpp_message.cpp index bf1be0f0..344adb23 100644 --- a/libmuscle/cpp/src/libmuscle/mpp_message.cpp +++ b/libmuscle/cpp/src/libmuscle/mpp_message.cpp @@ -16,7 +16,7 @@ MPPMessage::MPPMessage( ::libmuscle::impl::Optional port_length, double timestamp, ::libmuscle::impl::Optional next_timestamp, DataConstRef const & settings_overlay, - int message_number, + int message_number, double saved_until, DataConstRef const & data ) : sender(sender) @@ -26,6 +26,7 @@ MPPMessage::MPPMessage( , next_timestamp(next_timestamp) , settings_overlay(settings_overlay) , message_number(message_number) + , saved_until(saved_until) , data(data) {} @@ -51,6 +52,7 @@ MPPMessage MPPMessage::from_bytes(DataConstRef const & data) { next_timestamp, dict["settings_overlay"], dict["message_number"].as(), + dict["saved_until"].as(), dict["data"]); } @@ -71,6 +73,7 @@ DataConstRef MPPMessage::encoded() const { "next_timestamp", next_timestamp_data, "settings_overlay", settings_overlay, "message_number", message_number, + "saved_until", saved_until, "data", data ); diff --git a/libmuscle/cpp/src/libmuscle/mpp_message.hpp b/libmuscle/cpp/src/libmuscle/mpp_message.hpp index 96a26fe0..69f15645 100644 --- a/libmuscle/cpp/src/libmuscle/mpp_message.hpp +++ b/libmuscle/cpp/src/libmuscle/mpp_message.hpp @@ -33,7 +33,7 @@ struct MPPMessage { ::libmuscle::impl::Optional port_length, double timestamp, ::libmuscle::impl::Optional next_timestamp, DataConstRef const & settings_overlay, int message_number, - DataConstRef const & data); + double saved_until, DataConstRef const & data); /** Create an MCP Message from an encoded buffer. * @@ -54,6 +54,7 @@ struct MPPMessage { ::libmuscle::impl::Optional next_timestamp; DataConstRef settings_overlay; int message_number; + double saved_until; DataConstRef data; }; diff --git a/libmuscle/cpp/src/libmuscle/tests/mocks/mock_mpp_client.cpp b/libmuscle/cpp/src/libmuscle/tests/mocks/mock_mpp_client.cpp index 55ae3a76..31507a44 100644 --- a/libmuscle/cpp/src/libmuscle/tests/mocks/mock_mpp_client.cpp +++ b/libmuscle/cpp/src/libmuscle/tests/mocks/mock_mpp_client.cpp @@ -44,7 +44,8 @@ Settings MockMPPClient::make_overlay_() { } MPPMessage MockMPPClient::next_receive_message( - "test.out", "test2.in", 0, 0.0, 1.0, make_overlay_(), 0, Data::dict("test1", 12)); + "test.out", "test2.in", 0, 0.0, 1.0, make_overlay_(),0, 9.0, + Data::dict("test1", 12)); Reference MockMPPClient::last_receiver("_none"); diff --git a/libmuscle/cpp/src/libmuscle/tests/mocks/mock_post_office.cpp b/libmuscle/cpp/src/libmuscle/tests/mocks/mock_post_office.cpp index 6d2bb3cc..cb6a92d5 100644 --- a/libmuscle/cpp/src/libmuscle/tests/mocks/mock_post_office.cpp +++ b/libmuscle/cpp/src/libmuscle/tests/mocks/mock_post_office.cpp @@ -6,13 +6,13 @@ int MockPostOffice::handle_request( char const * res_buf, std::size_t res_len, std::unique_ptr & response) { response = std::make_unique( - MPPMessage("test.out", "test2.in", 0, 0.0, 1.0, Data(), 0, Data()).encoded()); + MPPMessage("test.out", "test2.in", 0, 0.0, 1.0, Data(), 0, 0.0, Data()).encoded()); return -1; } std::unique_ptr MockPostOffice::get_response(int fd) { return std::make_unique( - MPPMessage("test.out", "test2.in", 0, 0.0, 1.0, Data(), 0, Data()).encoded()); + MPPMessage("test.out", "test2.in", 0, 0.0, 1.0, Data(), 0, 8.0, Data()).encoded()); } void MockPostOffice::deposit( diff --git a/libmuscle/cpp/src/libmuscle/tests/tcp_transport_server_test.cpp b/libmuscle/cpp/src/libmuscle/tests/tcp_transport_server_test.cpp index 248f597f..7082a0bc 100644 --- a/libmuscle/cpp/src/libmuscle/tests/tcp_transport_server_test.cpp +++ b/libmuscle/cpp/src/libmuscle/tests/tcp_transport_server_test.cpp @@ -38,7 +38,7 @@ int main(int argc, char *argv[]) { "test_sender.port", receiver, 10, 0.0, 1.0, overlay_settings, - 0, + 0, 6.0, data_dict); auto msg_data = std::make_unique(msg.encoded()); post_office.deposit(receiver, std::move(msg_data)); diff --git a/libmuscle/cpp/src/libmuscle/tests/test_mpp_message.cpp b/libmuscle/cpp/src/libmuscle/tests/test_mpp_message.cpp index 53f2ed28..cebcedd5 100644 --- a/libmuscle/cpp/src/libmuscle/tests/test_mpp_message.cpp +++ b/libmuscle/cpp/src/libmuscle/tests/test_mpp_message.cpp @@ -29,7 +29,7 @@ TEST(test_mcp_message, create_mcp_message) { Reference("sender.port"), Reference("receiver.port"), 10, 100.1, 101.0, - test, 0, abc + test, 0, 1.0, abc ); ASSERT_EQ(m.sender, "sender.port"); @@ -39,6 +39,7 @@ TEST(test_mcp_message, create_mcp_message) { ASSERT_EQ(m.next_timestamp, 101.0); ASSERT_EQ(m.settings_overlay.as(), "test"); ASSERT_EQ(m.message_number, 0); + ASSERT_EQ(m.saved_until, 1.0); ASSERT_EQ(m.data.as(), "abc"); } @@ -49,7 +50,7 @@ TEST(test_mcp_message, create_mcp_message_minimal) { Reference("sender.port"), Reference("receiver.port"), {}, 100.1, {}, - test, 0, abc + test, 0, 2.0, abc ); ASSERT_EQ(m.sender, "sender.port"); @@ -59,6 +60,7 @@ TEST(test_mcp_message, create_mcp_message_minimal) { ASSERT_FALSE(m.next_timestamp.is_set()); ASSERT_TRUE(m.settings_overlay.is_nil()); ASSERT_EQ(m.message_number, 0); + ASSERT_EQ(m.saved_until, 2.0); ASSERT_TRUE(m.data.is_nil()); } @@ -71,6 +73,7 @@ TEST(test_mcp_message, from_bytes) { "next_timestamp", Data(), "settings_overlay", Data(), "message_number", 0, + "saved_until", 3.0, "data", Data() ); @@ -88,6 +91,7 @@ TEST(test_mcp_message, from_bytes) { ASSERT_FALSE(m.next_timestamp.is_set()); ASSERT_TRUE(m.settings_overlay.is_nil()); ASSERT_EQ(m.message_number, 0); + ASSERT_EQ(m.saved_until, 3.0); ASSERT_TRUE(m.data.is_nil()); } diff --git a/libmuscle/cpp/src/libmuscle/tests/test_outbox.cpp b/libmuscle/cpp/src/libmuscle/tests/test_outbox.cpp index 0d6769c5..e98bd423 100644 --- a/libmuscle/cpp/src/libmuscle/tests/test_outbox.cpp +++ b/libmuscle/cpp/src/libmuscle/tests/test_outbox.cpp @@ -36,7 +36,7 @@ TEST(libmuscle_outbox, test_deposit_retrieve_message) { Optional(), 0.0, 1.0, DataConstRef(), - 0, + 0, 1.0, DataConstRef("testing")); auto message_data = std::make_unique(message.encoded()); diff --git a/libmuscle/cpp/src/libmuscle/tests/test_post_office.cpp b/libmuscle/cpp/src/libmuscle/tests/test_post_office.cpp index f6cf05c2..bf6981d8 100644 --- a/libmuscle/cpp/src/libmuscle/tests/test_post_office.cpp +++ b/libmuscle/cpp/src/libmuscle/tests/test_post_office.cpp @@ -36,7 +36,7 @@ std::unique_ptr make_message() { "test_sender.port", "test_receiver.port", Optional(), 0.0, 1.0, - DataConstRef(), 0, DataConstRef()); + DataConstRef(), 0, 5.0, DataConstRef()); return std::make_unique(msg.encoded()); } diff --git a/libmuscle/cpp/src/libmuscle/tests/test_tcp_communication.cpp b/libmuscle/cpp/src/libmuscle/tests/test_tcp_communication.cpp index c6400404..2d152161 100644 --- a/libmuscle/cpp/src/libmuscle/tests/test_tcp_communication.cpp +++ b/libmuscle/cpp/src/libmuscle/tests/test_tcp_communication.cpp @@ -36,7 +36,7 @@ TEST(test_tcp_communication, send_receive) { MPPMessage msg( "test_sender.port", receiver, 10, 0.0, 1.0, - Data::dict("par1", 13), 1, + Data::dict("par1", 13), 1, 4.0, Data::dict("var1", 1, "var2", 2.0, "var3", "3")); auto msg_data = std::make_unique(msg.encoded()); post_office.deposit(receiver, std::move(msg_data)); @@ -54,6 +54,7 @@ TEST(test_tcp_communication, send_receive) { ASSERT_EQ(m.next_timestamp, 1.0); ASSERT_EQ(m.settings_overlay["par1"].as(), 13); ASSERT_EQ(m.message_number, 1); + ASSERT_EQ(m.saved_until, 4.0); ASSERT_EQ(m.data["var1"].as(), 1); ASSERT_EQ(m.data["var2"].as(), 2.0); ASSERT_EQ(m.data["var3"].as(), "3"); diff --git a/libmuscle/python/libmuscle/checkpoint_triggers.py b/libmuscle/python/libmuscle/checkpoint_triggers.py index 88a561f8..7ea2759d 100644 --- a/libmuscle/python/libmuscle/checkpoint_triggers.py +++ b/libmuscle/python/libmuscle/checkpoint_triggers.py @@ -1,5 +1,4 @@ import bisect -from datetime import datetime, timezone import logging import time from typing import List, Optional, Union @@ -160,18 +159,6 @@ def previous_checkpoint(self, cur_time: float) -> Optional[float]: default=None) # return None if all triggers return None -def _utc_to_monotonic(utc: datetime) -> float: - """Convert UTC time point to a reference value of time.monotonic() - - Args: - utc: datetime in UTC timezone - """ - curmono = time.monotonic() - curutc = datetime.now(timezone.utc) - elapsed_seconds = (curutc - utc).total_seconds() - return curmono - elapsed_seconds - - class TriggerManager: """Manages all checkpoint triggers and checks if a snapshot must be saved. """ @@ -179,18 +166,19 @@ class TriggerManager: def __init__(self) -> None: self._has_checkpoints = False self._last_triggers = [] # type: List[str] - self._monotonic_reference = time.monotonic() + self._cpts_considered_until = float('-inf') def set_checkpoint_info( - self, utc_reference: datetime, checkpoints: Checkpoints) -> None: + self, elapsed: float, checkpoints: Checkpoints) -> None: """Register checkpoint info received from the muscle manager. """ + self._mono_to_elapsed = elapsed - time.monotonic() + if not checkpoints: self._has_checkpoints = False return self._has_checkpoints = True - self._monotonic_reference = _utc_to_monotonic(utc_reference) self._checkpoint_at_end = checkpoints.at_end @@ -206,7 +194,19 @@ def set_checkpoint_info( def elapsed_walltime(self) -> float: """Returns elapsed wallclock_time in seconds. """ - return time.monotonic() - self._monotonic_reference + return time.monotonic() + self._mono_to_elapsed + + def checkpoints_considered_until(self) -> float: + """Return elapsed time of last should_save* + """ + return self._cpts_considered_until + + def harmonise_wall_time(self, at_least: float) -> None: + """Ensure our elapsed time is at least the given value + """ + cur = self.elapsed_walltime() + if cur < at_least: + self._mono_to_elapsed += at_least - cur def snapshots_enabled(self) -> bool: """Check if the current workflow has snapshots enabled. @@ -219,8 +219,7 @@ def should_save_snapshot(self, timestamp: float) -> bool: if not self._has_checkpoints: return False - elapsed_walltime = self.elapsed_walltime() - return self.__should_save(elapsed_walltime, timestamp) + return self.__should_save(timestamp) def should_save_final_snapshot( self, do_reuse: bool, f_init_max_timestamp: Optional[float] @@ -241,8 +240,7 @@ def should_save_final_snapshot( ' Not creating a snapshot.') self._sim_reset = True else: - elapsed_walltime = self.elapsed_walltime() - value = self.__should_save(elapsed_walltime, f_init_max_timestamp) + value = self.__should_save(f_init_max_timestamp) return value @@ -270,11 +268,10 @@ def get_triggers(self) -> List[str]: self._last_triggers = [] return triggers - def __should_save(self, walltime: float, simulation_time: float) -> bool: + def __should_save(self, simulation_time: float) -> bool: """Check if a checkpoint should be taken Args: - walltime: current wallclock time (elapsed since reference) simulation_time: current/next timestamp as reported by the instance """ if self._sim_reset: @@ -290,6 +287,9 @@ def __should_save(self, walltime: float, simulation_time: float) -> bool: self._nextsim = self._sim.next_checkpoint(simulation_time) self._sim_reset = False + walltime = self.elapsed_walltime() + self._cpts_considered_until = walltime + self._last_triggers = [] if self._nextwall is not None and walltime >= self._nextwall: self._last_triggers.append(f"wallclock_time >= {self._nextwall}") diff --git a/libmuscle/python/libmuscle/communicator.py b/libmuscle/python/libmuscle/communicator.py index bf1cf33e..69272f78 100644 --- a/libmuscle/python/libmuscle/communicator.py +++ b/libmuscle/python/libmuscle/communicator.py @@ -181,7 +181,8 @@ def get_port(self, port_name: str) -> Port: def send_message( self, port_name: str, message: Message, - slot: Optional[int] = None) -> None: + slot: Optional[int] = None, + checkpoints_considered_until: float = float('-inf')) -> None: """Send a message and settings to the outside world. Sending is non-blocking, a copy of the message will be made @@ -191,6 +192,8 @@ def send_message( port_name: The port on which this message is to be sent. message: The message to be sent. slot: The slot to send the message on, if any. + checkpoints_considered_until: When we last checked if we + should save a snapshot (wallclock time). """ if slot is None: _logger.debug('Sending message on {}'.format(port_name)) @@ -227,6 +230,7 @@ def send_message( message.timestamp, message.next_timestamp, cast(Settings, message.settings), port.get_num_messages(slot), + checkpoints_considered_until, message.data) encoded_message = mcp_message.encoded() self._post_office.deposit(recv_endpoint.ref(), encoded_message) @@ -240,7 +244,7 @@ def send_message( def receive_message(self, port_name: str, slot: Optional[int] = None, default: Optional[Message] = None - ) -> Message: + ) -> Tuple[Message, float]: """Receive a message and attached settings overlay. Receiving is a blocking operation. This function will contact @@ -260,7 +264,8 @@ def receive_message(self, port_name: str, slot: Optional[int] = None, Returns: The received message, with message.settings holding the settings overlay. The settings attribute is - guaranteed to not be None. + guaranteed to not be None. Secondly, the saved_until + metadata field from the received message. Raises: RuntimeError: If no default was given and the port is not @@ -286,7 +291,7 @@ def receive_message(self, port_name: str, slot: Optional[int] = None, _logger.debug( 'No message received on {} as it is not connected'.format( port_name)) - return default + return default, float('-inf') if port_name in self._ports: port = self._ports[port_name] @@ -304,28 +309,28 @@ def receive_message(self, port_name: str, slot: Optional[int] = None, snd_endpoint = self._peer_manager.get_peer_endpoints( recv_endpoint.port, slot_list)[0] client = self.__get_client(snd_endpoint.instance()) - mcp_message_bytes = client.receive(recv_endpoint.ref()) - mcp_message = MPPMessage.from_bytes(mcp_message_bytes) + mpp_message_bytes = client.receive(recv_endpoint.ref()) + mpp_message = MPPMessage.from_bytes(mpp_message_bytes) - if mcp_message.port_length is not None: + if mpp_message.port_length is not None: if port.is_resizable(): - port.set_length(mcp_message.port_length) + port.set_length(mpp_message.port_length) - if isinstance(mcp_message.data, ClosePort): + if isinstance(mpp_message.data, ClosePort): port.set_closed(slot) message = Message( - mcp_message.timestamp, mcp_message.next_timestamp, - mcp_message.data, mcp_message.settings_overlay) + mpp_message.timestamp, mpp_message.next_timestamp, + mpp_message.data, mpp_message.settings_overlay) profile_event.stop() if port.is_vector(): profile_event.port_length = port.get_length() - profile_event.message_size = len(mcp_message_bytes) + profile_event.message_size = len(mpp_message_bytes) expected_message_number = port.get_num_messages(slot) - if expected_message_number != mcp_message.message_number: - if (expected_message_number - 1 == mcp_message.message_number and + if expected_message_number != mpp_message.message_number: + if (expected_message_number - 1 == mpp_message.message_number and port.is_resuming(slot)): _logger.debug(f'Discarding received message on {port_and_slot}' ': resuming from weakly consistent snapshot') @@ -333,16 +338,16 @@ def receive_message(self, port_name: str, slot: Optional[int] = None, return self.receive_message(port_name, slot, default) raise RuntimeError(f'Received message on {port_and_slot} with' ' unexpected message number' - f' {mcp_message.message_number}. Was expecting' + f' {mpp_message.message_number}. Was expecting' f' {expected_message_number}. Are you resuming' ' from an inconsistent snapshot?') port.increment_num_messages(slot) _logger.debug('Received message on {}'.format(port_and_slot)) - if isinstance(mcp_message.data, ClosePort): + if isinstance(mpp_message.data, ClosePort): _logger.debug('Port {} is now closed'.format(port_and_slot)) - return message + return message, mpp_message.saved_until def close_port(self, port_name: str, slot: Optional[int] = None ) -> None: diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py index a94ccb76..1f8dc1dd 100644 --- a/libmuscle/python/libmuscle/instance.py +++ b/libmuscle/python/libmuscle/instance.py @@ -118,8 +118,8 @@ def __init__(self, ports: Optional[Dict[Operator, List[str]]] = None, checkpoint_info = self.__manager.get_checkpoint_info( self._instance_name()) - utc_reference, checkpoints = checkpoint_info[0:2] - self._trigger_manager.set_checkpoint_info(utc_reference, checkpoints) + elapsed_time, checkpoints = checkpoint_info[0:2] + self._trigger_manager.set_checkpoint_info(elapsed_time, checkpoints) resume_snapshot, snapshot_dir = checkpoint_info[2:4] saved_at = self._snapshot_manager.prepare_resume( @@ -378,7 +378,9 @@ def send(self, port_name: str, message: Message, message = copy(message) message.settings = self._settings_manager.overlay - self._communicator.send_message(port_name, message, slot) + self._communicator.send_message( + port_name, message, slot, + self._trigger_manager.checkpoints_considered_until()) def receive(self, port_name: str, slot: Optional[int] = None, default: Optional[Message] = None @@ -826,7 +828,7 @@ def __receive_message( raise RuntimeError(err_msg) else: - msg = self._communicator.receive_message( + msg, saved_until = self._communicator.receive_message( port_name, slot, default) if port.is_connected() and not port.is_open(slot): err_msg = (('Port {} was closed while trying to' @@ -838,6 +840,7 @@ def __receive_message( self.__check_compatibility(port_name, msg.settings) if not with_settings: msg.settings = None + self._trigger_manager.harmonise_wall_time(saved_until) return msg def __make_full_name(self @@ -942,7 +945,7 @@ def __receive_settings(self) -> bool: False iff the port is connnected and ClosePort was received. """ default_message = Message(0.0, None, Settings(), Settings()) - message = self._communicator.receive_message( + message, saved_until = self._communicator.receive_message( 'muscle_settings_in', None, default_message) if isinstance(message.data, ClosePort): return False @@ -959,6 +962,8 @@ def __receive_settings(self) -> bool: for key, value in message.data.items(): settings[key] = value self._settings_manager.overlay = settings + + self._trigger_manager.harmonise_wall_time(saved_until) return True def __pre_receive_f_init(self, apply_overlay: bool) -> None: @@ -968,12 +973,14 @@ def __pre_receive_f_init(self, apply_overlay: bool) -> None: in self._f_init_cache. """ def pre_receive(port_name: str, slot: Optional[int]) -> None: - msg = self._communicator.receive_message(port_name, slot) + msg, saved_until = self._communicator.receive_message( + port_name, slot) self._f_init_cache[(port_name, slot)] = msg if apply_overlay: self.__apply_overlay(msg) self.__check_compatibility(port_name, msg.settings) msg.settings = None + self._trigger_manager.harmonise_wall_time(saved_until) self._f_init_cache = dict() ports = self._communicator.list_ports() diff --git a/libmuscle/python/libmuscle/manager/mmp_server.py b/libmuscle/python/libmuscle/manager/mmp_server.py index f689e6f5..d609fce1 100644 --- a/libmuscle/python/libmuscle/manager/mmp_server.py +++ b/libmuscle/python/libmuscle/manager/mmp_server.py @@ -1,6 +1,6 @@ -from datetime import datetime, timezone import errno import logging +import time from typing import Any, Dict, cast, List, Optional import msgpack @@ -74,8 +74,7 @@ def __init__( self._topology_store = topology_store self._snapshot_registry = snapshot_registry self._run_dir = run_dir - self._reference_time = datetime.now(timezone.utc) - self._reference_timestamp = self._reference_time.timestamp() + self._reference_time = time.monotonic() def handle_request(self, request: bytes) -> bytes: """Handles a manager request. @@ -303,7 +302,7 @@ def _get_checkpoint_info(self, instance_id: str) -> Any: snapshot_directory = str(self._run_dir.snapshot_dir(instance)) return [ResponseType.SUCCESS.value, - self._reference_timestamp, + time.monotonic() - self._reference_time, encode_checkpoints(self._configuration.checkpoints), resume, snapshot_directory] diff --git a/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py b/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py index 876ae197..bc61f0a0 100644 --- a/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py +++ b/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py @@ -1,5 +1,4 @@ import dataclasses -from datetime import datetime, timezone from pathlib import Path from unittest.mock import MagicMock @@ -109,10 +108,9 @@ def test_get_checkpoint_info(mmp_configuration, mmp_request_handler): decoded_result = msgpack.unpackb(result, raw=False) assert decoded_result[0] == ResponseType.SUCCESS.value - timestamp, checkpoints, resume, snapshot_directory = decoded_result[1:] + elapsed_time, checkpoints, resume, snapshot_directory = decoded_result[1:] - ref_time = datetime.fromtimestamp(timestamp, tz=timezone.utc) - assert ref_time == mmp_request_handler._reference_time + assert elapsed_time > 0.0 assert isinstance(checkpoints, dict) assert checkpoints.keys() == {'at_end', 'wallclock_time', 'simulation_time'} diff --git a/libmuscle/python/libmuscle/mmp_client.py b/libmuscle/python/libmuscle/mmp_client.py index 14e83e9a..eed4d99a 100644 --- a/libmuscle/python/libmuscle/mmp_client.py +++ b/libmuscle/python/libmuscle/mmp_client.py @@ -1,5 +1,4 @@ import dataclasses -from datetime import datetime, timezone from pathlib import Path from random import uniform from time import perf_counter, sleep @@ -23,7 +22,7 @@ PEER_INTERVAL_MAX = 10.0 _CheckpointInfoType = Tuple[ - datetime, Checkpoints, Optional[Path], Optional[Path]] + float, Checkpoints, Optional[Path], Optional[Path]] def encode_operator(op: Operator) -> str: @@ -64,7 +63,7 @@ def decode_checkpoint_rule(rule: Dict[str, Any]) -> CheckpointRule: def decode_checkpoint_info( - reference_timestamp: float, + elapsed_time: float, checkpoints_dict: Dict[str, Any], resume: Optional[str], snapshot_dir: Optional[str] @@ -72,19 +71,17 @@ def decode_checkpoint_info( """Decode checkpoint info from a MsgPack-compatible value. Args: - reference_timestamp: seconds since UNIX epoch in UTC timezone to use as - wallclock_time = 0 + elapsed_time: current elapsed time according to the manager checkpoints_dict: checkpoint definitions from the MsgPack resume: path to the snapshot we should resume from, if any snapshot_dir: path to the directory to store new snapshots in Returns: - wallclock_time_reference: UTC time where wallclock_time = 0 + elapsed_time: current elapsed time according to the manager checkpoints: checkpoint configuration - resume: path to the resume snapshot - snapshot_dir: path to store the snapshots in + resume: path to the snapshot we should resume from, if any + snapshot_dir: path to the directory to store new snapshots in """ - ref_time = datetime.fromtimestamp(reference_timestamp, tz=timezone.utc) checkpoints = Checkpoints( at_end=checkpoints_dict["at_end"], wallclock_time=[decode_checkpoint_rule(rule) @@ -93,7 +90,7 @@ def decode_checkpoint_info( for rule in checkpoints_dict["simulation_time"]]) resume_path = None if resume is None else Path(resume) snapshot_path = None if snapshot_dir is None else Path(snapshot_dir) - return (ref_time, checkpoints, resume_path, snapshot_path) + return (elapsed_time, checkpoints, resume_path, snapshot_path) class MMPClient(): @@ -173,7 +170,7 @@ def get_checkpoint_info(self, name: Reference) -> _CheckpointInfoType: """Get the checkpoint info from the manager. Returns: - wallclock_time_reference: UTC time where wallclock_time = 0 + elapsed_time: current elapsed time checkpoints: checkpoint configuration resume: path to the resume snapshot snapshot_directory: path to store snapshots diff --git a/libmuscle/python/libmuscle/mpp_message.py b/libmuscle/python/libmuscle/mpp_message.py index b9033d75..c57effb3 100644 --- a/libmuscle/python/libmuscle/mpp_message.py +++ b/libmuscle/python/libmuscle/mpp_message.py @@ -151,7 +151,8 @@ class MPPMessage: def __init__(self, sender: Reference, receiver: Reference, port_length: Optional[int], timestamp: float, next_timestamp: Optional[float], - settings_overlay: Settings, message_number: int, data: Any + settings_overlay: Settings, message_number: int, + saved_until: float, data: Any ) -> None: """Create an MPPMessage. @@ -169,6 +170,9 @@ def __init__(self, sender: Reference, receiver: Reference, receiver: The receiving endpoint. port_length: Length of the slot, where applicable. settings_overlay: The serialised overlay settings. + message_number: Sequence number on this conduit. + saved_until: Elapsed time until which the sender has + processed checkpoints. data: The serialised contents of the message. """ # make sure timestamp and next_timestamp are floats @@ -183,6 +187,7 @@ def __init__(self, sender: Reference, receiver: Reference, self.next_timestamp = next_timestamp self.settings_overlay = settings_overlay self.message_number = message_number + self.saved_until = saved_until if isinstance(data, np.ndarray): self.data = Grid(data) else: @@ -204,11 +209,12 @@ def from_bytes(message: bytes) -> 'MPPMessage': next_timestamp = message_dict["next_timestamp"] settings_overlay = message_dict["settings_overlay"] message_number = message_dict["message_number"] + saved_until = message_dict["saved_until"] data = message_dict["data"] return MPPMessage( sender, receiver, port_length, timestamp, next_timestamp, - settings_overlay, message_number, data) + settings_overlay, message_number, saved_until, data) def encoded(self) -> bytes: """Encode the message and return as a bytes buffer. @@ -221,6 +227,7 @@ def encoded(self) -> bytes: 'next_timestamp': self.next_timestamp, 'settings_overlay': self.settings_overlay, 'message_number': self.message_number, + 'saved_until': self.saved_until, 'data': self.data } diff --git a/libmuscle/python/libmuscle/snapshot.py b/libmuscle/python/libmuscle/snapshot.py index 633d3f3d..2f86a220 100644 --- a/libmuscle/python/libmuscle/snapshot.py +++ b/libmuscle/python/libmuscle/snapshot.py @@ -84,7 +84,7 @@ def message_to_bytes(message: Optional['communicator.Message']) -> bytes: settings = message.settings return MPPMessage(Reference('_'), Reference('_'), None, message.timestamp, message.next_timestamp, - settings, 0, message.data).encoded() + settings, 0, -1.0, message.data).encoded() @staticmethod def bytes_to_message(data: bytes) -> Optional['communicator.Message']: diff --git a/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py b/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py index 388e6eca..e111a758 100644 --- a/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py +++ b/libmuscle/python/libmuscle/test/test_checkpoint_triggers.py @@ -1,4 +1,3 @@ -from datetime import datetime, timedelta, timezone import time import pytest from ymmsl import CheckpointRangeRule, CheckpointAtRule, Checkpoints @@ -137,20 +136,19 @@ def test_combined_checkpoint_trigger_at_ranges(): def test_trigger_manager_reference_time(): - monotonic_now = time.monotonic() - utcnow = datetime.now(timezone.utc) - reference = utcnow - timedelta(seconds=15) + monotonic_start = time.monotonic() + ref_elapsed = 15.0 trigger_manager = TriggerManager() - trigger_manager.set_checkpoint_info(reference, Checkpoints(at_end=True)) + trigger_manager.set_checkpoint_info(ref_elapsed, Checkpoints(at_end=True)) elapsed_walltime = trigger_manager.elapsed_walltime() - elapsed_monotonic = time.monotonic() - monotonic_now - assert 15.0 < elapsed_walltime <= (15.0 + elapsed_monotonic) + duration = time.monotonic() - monotonic_start + assert ref_elapsed < elapsed_walltime <= (ref_elapsed + duration) def test_trigger_manager(): - reference = datetime.now(timezone.utc) + ref_elapsed = 0.0 trigger_manager = TriggerManager() - trigger_manager.set_checkpoint_info(reference, Checkpoints( + trigger_manager.set_checkpoint_info(ref_elapsed, Checkpoints( at_end=True, wallclock_time=[CheckpointAtRule([1e-12])], simulation_time=[CheckpointAtRule([1, 3, 5])])) @@ -181,8 +179,7 @@ def test_trigger_manager(): def test_no_checkpointing() -> None: trigger_manager = TriggerManager() - trigger_manager.set_checkpoint_info( - datetime.now(timezone.utc), Checkpoints()) + trigger_manager.set_checkpoint_info(0.0, Checkpoints()) assert not trigger_manager.should_save_snapshot(1) assert not trigger_manager.should_save_snapshot(5000) diff --git a/libmuscle/python/libmuscle/test/test_communicator.py b/libmuscle/python/libmuscle/test/test_communicator.py index a297a820..140e9399 100644 --- a/libmuscle/python/libmuscle/test/test_communicator.py +++ b/libmuscle/python/libmuscle/test/test_communicator.py @@ -397,28 +397,31 @@ def test_receive_message(communicator) -> None: client_mock = MagicMock() client_mock.receive.return_value = MPPMessage( Reference('other.out[13]'), Reference('kernel[13].in'), - None, 0.0, None, Settings({'test1': 12}), 0, + None, 0.0, None, Settings({'test1': 12}), 0, 2.0, b'test').encoded() get_client_mock = MagicMock(return_value=client_mock) communicator._Communicator__get_client = get_client_mock communicator._profiler = MagicMock() - msg = communicator.receive_message('in') + msg, last_saved = communicator.receive_message('in') get_client_mock.assert_called_with(Reference('other')) client_mock.receive.assert_called_with(Reference('kernel[13].in')) assert msg.data == b'test' assert msg.settings['test1'] == 12 + assert last_saved == 2.0 def test_receive_message_default(communicator) -> None: communicator._peer_manager.is_connected.return_value = False default_msg = Message(3.0, 4.0, 'test', Settings()) - msg = communicator.receive_message('not_connected', default=default_msg) + msg, last_saved = communicator.receive_message( + 'not_connected', default=default_msg) assert msg.timestamp == 3.0 assert msg.next_timestamp == 4.0 assert msg.data == 'test' assert len(msg.settings) == 0 + assert last_saved == float('-inf') def test_receive_message_no_default(communicator) -> None: @@ -436,71 +439,75 @@ def test_receive_msgpack(communicator) -> None: client_mock = MagicMock() client_mock.receive.return_value = MPPMessage( Reference('other.out[13]'), Reference('kernel[13].in'), - None, 0.0, None, Settings({'test1': 12}), 0, + None, 0.0, None, Settings({'test1': 12}), 0, 1.0, {'test': 13}).encoded() get_client_mock = MagicMock(return_value=client_mock) communicator._Communicator__get_client = get_client_mock communicator._profiler = MagicMock() - msg = communicator.receive_message('in') + msg, last_saved = communicator.receive_message('in') get_client_mock.assert_called_with(Reference('other')) client_mock.receive.assert_called_with(Reference('kernel[13].in')) assert msg.data == {'test': 13} + assert last_saved == 1.0 def test_receive_with_slot(communicator2) -> None: client_mock = MagicMock() client_mock.receive.return_value = MPPMessage( Reference('kernel[13].out'), Reference('other.in[13]'), - None, 0.0, None, Settings({'test': 'testing'}), 0, + None, 0.0, None, Settings({'test': 'testing'}), 0, 3.0, b'test').encoded() get_client_mock = MagicMock(return_value=client_mock) communicator2._Communicator__get_client = get_client_mock communicator2._profiler = MagicMock() - msg = communicator2.receive_message('in', 13) + msg, last_saved = communicator2.receive_message('in', 13) get_client_mock.assert_called_with(Reference('kernel[13]')) client_mock.receive.assert_called_with(Reference('other.in[13]')) assert msg.data == b'test' assert msg.settings['test'] == 'testing' + assert last_saved == 3.0 def test_receive_message_resizable(communicator3) -> None: client_mock = MagicMock() client_mock.receive.return_value = MPPMessage( Reference('other.out[13]'), Reference('kernel.in[13]'), - 20, 0.0, None, Settings({'test': 'testing'}), 0, + 20, 0.0, None, Settings({'test': 'testing'}), 0, 12.3, b'test').encoded() get_client_mock = MagicMock(return_value=client_mock) communicator3._Communicator__get_client = get_client_mock communicator3._profiler = MagicMock() - msg = communicator3.receive_message('in', 13) + msg, last_saved = communicator3.receive_message('in', 13) get_client_mock.assert_called_with(Reference('other')) client_mock.receive.assert_called_with(Reference('kernel.in[13]')) assert msg.data == b'test' assert communicator3.get_port('in').get_length() == 20 + assert last_saved == 12.3 def test_receive_with_settings(communicator) -> None: client_mock = MagicMock() client_mock.receive.return_value = MPPMessage( Reference('other.out[13]'), Reference('kernel[13].in'), - None, 0.0, None, Settings({'test2': 3.1}), 0, + None, 0.0, None, Settings({'test2': 3.1}), 0, 0.1, b'test').encoded() get_client_mock = MagicMock(return_value=client_mock) communicator._Communicator__get_client = get_client_mock communicator._profiler = MagicMock() - msg = communicator.receive_message('in') + msg, last_saved = communicator.receive_message('in') get_client_mock.assert_called_with(Reference('other')) client_mock.receive.assert_called_with(Reference('kernel[13].in')) assert msg.data == b'test' assert msg.settings['test2'] == 3.1 + assert last_saved == 0.1 def test_receive_msgpack_with_slot_and_settings(communicator2) -> None: @@ -508,56 +515,58 @@ def test_receive_msgpack_with_slot_and_settings(communicator2) -> None: client_mock.receive.return_value = MPPMessage( Reference('kernel[13].out'), Reference('other.in[13]'), None, 0.0, 1.0, - Settings({'test': 'testing'}), 0, 'test').encoded() + Settings({'test': 'testing'}), 0, 1.0, 'test').encoded() get_client_mock = MagicMock(return_value=client_mock) communicator2._Communicator__get_client = get_client_mock communicator2._profiler = MagicMock() - msg = communicator2.receive_message('in', 13) + msg, last_saved = communicator2.receive_message('in', 13) get_client_mock.assert_called_with(Reference('kernel[13]')) client_mock.receive.assert_called_with(Reference('other.in[13]')) assert msg.data == 'test' assert msg.settings['test'] == 'testing' + assert last_saved == 1.0 def test_receive_settings(communicator) -> None: client_mock = MagicMock() client_mock.receive.return_value = MPPMessage( Reference('other.out[13]'), Reference('kernel[13].in'), - None, 0.0, None, Settings({'test1': 12}), 0, + None, 0.0, None, Settings({'test1': 12}), 0, 1.0, Settings({'test': 13})).encoded() get_client_mock = MagicMock(return_value=client_mock) communicator._Communicator__get_client = get_client_mock communicator._profiler = MagicMock() - msg = communicator.receive_message('in') + msg, last_saved = communicator.receive_message('in') get_client_mock.assert_called_with(Reference('other')) client_mock.receive.assert_called_with(Reference('kernel[13].in')) assert isinstance(msg.data, Settings) assert msg.data['test'] == 13 + assert last_saved == 1.0 def test_receive_close_port(communicator) -> None: client_mock = MagicMock() client_mock.receive.return_value = MPPMessage( Reference('other.out[13]'), Reference('kernel[13].in'), - None, 0.0, None, Settings(), 0, ClosePort()).encoded() + None, 0.0, None, Settings(), 0, 0.1, ClosePort()).encoded() get_client_mock = MagicMock(return_value=client_mock) communicator._Communicator__get_client = get_client_mock communicator._profiler = MagicMock() - msg = communicator.receive_message('in') + msg, _ = communicator.receive_message('in') assert isinstance(msg.data, ClosePort) def test_get_message(communicator, message) -> None: - communicator.send_message('out', message) + communicator.send_message('out', message, None, 2.0) ref_message = MPPMessage( Reference('kernel[13].out'), Reference('other.in[13]'), - None, 0.0, None, Settings(), 0, b'test').encoded() + None, 0.0, None, Settings(), 0, 2.0, b'test').encoded() assert communicator._post_office.get_message( 'other.in[13]') == ref_message @@ -616,7 +625,7 @@ def test_port_count_validation(communicator): client_mock = MagicMock() client_mock.receive.return_value = MPPMessage( Reference('other.out[13]'), Reference('kernel[13].in'), - None, 0.0, None, Settings({'test1': 12}), 0, + None, 0.0, None, Settings({'test1': 12}), 0, 7.6, b'test').encoded() get_client_mock = MagicMock(return_value=client_mock) communicator._Communicator__get_client = get_client_mock @@ -634,7 +643,7 @@ def test_port_discard_error_on_resume(caplog, communicator): client_mock = MagicMock() client_mock.receive.return_value = MPPMessage( Reference('other.out[13]'), Reference('kernel[13].in'), - None, 0.0, None, Settings({'test1': 12}), 1, + None, 0.0, None, Settings({'test1': 12}), 1, 2.3, b'test').encoded() get_client_mock = MagicMock(return_value=client_mock) communicator._Communicator__get_client = get_client_mock @@ -661,7 +670,7 @@ def test_port_discard_success_on_resume(caplog, communicator): client_mock = MagicMock() client_mock.receive.side_effect = [MPPMessage( Reference('other.out[13]'), Reference('kernel[13].in'), - None, 0.0, None, Settings({'test1': 12}), message_number, + None, 0.0, None, Settings({'test1': 12}), message_number, 1.0, {'this is message': message_number}).encoded() for message_number in [1, 2]] get_client_mock = MagicMock(return_value=client_mock) @@ -676,7 +685,7 @@ def test_port_discard_success_on_resume(caplog, communicator): assert port.is_resuming(None) with caplog.at_level(logging.DEBUG, 'libmuscle.communicator'): - msg = communicator.receive_message('in') + msg, _ = communicator.receive_message('in') # records 0, 2 and 3 are debug logs for starting/receiving on port assert 'Discarding received message' in caplog.records[1].message # message_number=1 should be discarded: diff --git a/libmuscle/python/libmuscle/test/test_instance.py b/libmuscle/python/libmuscle/test/test_instance.py index 17e3e3e0..7c8b1be4 100644 --- a/libmuscle/python/libmuscle/test/test_instance.py +++ b/libmuscle/python/libmuscle/test/test_instance.py @@ -1,4 +1,3 @@ -from datetime import datetime, timezone import sys from typing import Generator from unittest.mock import MagicMock, patch @@ -44,13 +43,12 @@ def instance(sys_argv_instance, tmp_path): settings = Settings() settings['test1'] = 12 msg = Message(0.0, 1.0, 'message', settings) - communicator.receive_message.return_value = msg + communicator.receive_message.return_value = msg, 10.0 comm_type.return_value = communicator mmp_client_object = MagicMock() mmp_client_object.request_peers.return_value = (None, None, None) - checkpoint_info = (datetime.now(timezone.utc), Checkpoints(), None, - tmp_path) + checkpoint_info = (0.0, Checkpoints(), None, tmp_path) mmp_client_object.get_checkpoint_info.return_value = checkpoint_info mmp_client.return_value = mmp_client_object @@ -68,8 +66,7 @@ def instance2(sys_argv_instance, tmp_path): patch('libmuscle.instance.Communicator'): mmp_client_object = MagicMock() mmp_client_object.request_peers.return_value = (None, None, None) - checkpoint_info = (datetime.now(timezone.utc), Checkpoints(), None, - tmp_path) + checkpoint_info = (0.0, Checkpoints(), None, tmp_path) mmp_client_object.get_checkpoint_info.return_value = checkpoint_info mmp_client.return_value = mmp_client_object instance = Instance({ @@ -84,8 +81,7 @@ def test_create_instance( patch('libmuscle.instance.Communicator') as comm_type: mmp_client_object = MagicMock() mmp_client_object.request_peers.return_value = (None, None, None) - checkpoint_info = (datetime.now(timezone.utc), Checkpoints(), None, - tmp_path) + checkpoint_info = (0.0, Checkpoints(), None, tmp_path) mmp_client_object.get_checkpoint_info.return_value = checkpoint_info mmp_client.return_value = mmp_client_object ports = { @@ -169,9 +165,10 @@ def test_is_vector_port(instance): def test_send(instance, message): + instance._trigger_manager._cpts_considered_until = 17.0 instance.send('out', message, 1) assert instance._communicator.send_message.called_with( - 'out', message, 1) + 'out', message, 1, 17.0) def test_send_invalid_port(instance, message): @@ -242,7 +239,8 @@ def test_reuse_instance_receive_overlay(instance): test_overlay = Settings() test_overlay['test2'] = 'abc' recv = instance._communicator.receive_message - recv.return_value = Message(0.0, None, test_overlay, test_base_settings) + msg = Message(0.0, None, test_overlay, test_base_settings) + recv.return_value = msg, 0.0 instance.reuse_instance() assert instance._communicator.receive_message.called_with( 'muscle_settings_in') @@ -254,9 +252,9 @@ def test_reuse_instance_receive_overlay(instance): def test_reuse_instance_closed_port(instance): def receive_message(port_name, slot=None, default=None): if port_name == 'muscle_settings_in': - return Message(0.0, None, Settings(), Settings()) + return Message(0.0, None, Settings(), Settings()), 0.0 elif port_name == 'in': - return Message(0.0, None, ClosePort(), Settings()) + return Message(0.0, None, ClosePort(), Settings()), 1.0 assert False # pragma: no cover def get_port(port_name): @@ -282,10 +280,10 @@ def get_port(port_name): def test_reuse_instance_vector_port(instance2): def receive_message(port_name, slot=None, default=None): if port_name == 'muscle_settings_in': - return Message(0.0, None, Settings(), Settings()) + return Message(0.0, None, Settings(), Settings()), 0.0 elif port_name == 'in': data = 'test {}'.format(slot) - return Message(0.0, None, data, Settings()) + return Message(0.0, None, data, Settings()), 0.0 assert False # pragma: no cover instance2._communicator.receive_message = receive_message @@ -310,7 +308,7 @@ def receive_message(port_name, slot=None, default=None): def test_reuse_instance_no_f_init_ports(instance): instance._communicator.receive_message.return_value = Message( - 0.0, None, Settings(), Settings()) + 0.0, None, Settings(), Settings()), 0.0 instance._communicator.list_ports.return_value = {} instance._communicator.settings_in_connected.return_value = False do_reuse = instance.reuse_instance() diff --git a/libmuscle/python/libmuscle/test/test_mpp_message.py b/libmuscle/python/libmuscle/test/test_mpp_message.py index dce3ed88..aaebe351 100644 --- a/libmuscle/python/libmuscle/test/test_mpp_message.py +++ b/libmuscle/python/libmuscle/test/test_mpp_message.py @@ -15,10 +15,12 @@ def test_create() -> None: timestamp = 10.0 next_timestamp = 11.0 settings_overlay = (6789).to_bytes(2, 'little', signed=True) + message_number = 0 + saved_until = 1.6 data = (12345).to_bytes(2, 'little', signed=True) msg = MPPMessage( sender, receiver, None, timestamp, next_timestamp, - settings_overlay, 0, data) + settings_overlay, message_number, saved_until, data) assert msg.sender == sender assert msg.receiver == receiver assert msg.port_length is None @@ -26,6 +28,7 @@ def test_create() -> None: assert msg.next_timestamp == 11.0 assert msg.settings_overlay == settings_overlay assert msg.message_number == 0 + assert msg.saved_until == 1.6 assert msg.data == data @@ -44,7 +47,7 @@ def test_grid_encode() -> None: grid = Grid(array, ['x', 'y', 'z']) msg = MPPMessage( sender, receiver, None, timestamp, next_timestamp, Settings(), - 0, grid) + 0, 1.0, grid) wire_data = msg.encoded() mcp_decoded = msgpack.unpackb(wire_data, raw=False) @@ -88,6 +91,7 @@ def test_grid_decode() -> None: 'next_timestamp': None, 'settings_overlay': msgpack.ExtType(1, settings_data), 'message_number': 0, + 'saved_until': 9.9, 'data': msgpack.ExtType(2, grid_data)} wire_data = msgpack.packb(msg_dict, use_bin_type=True) @@ -137,7 +141,7 @@ def test_grid_roundtrip() -> None: grid = Grid(array, ['x', 'y', 'z']) msg = MPPMessage( sender, receiver, None, timestamp, next_timestamp, Settings(), - 0, grid) + 0, 1.0, grid) wire_data = msg.encoded() msg_out = MPPMessage.from_bytes(wire_data) @@ -171,7 +175,7 @@ def test_non_contiguous_grid_roundtrip() -> None: grid = Grid(array.real, ['a', 'b', 'c']) msg = MPPMessage( sender, receiver, None, timestamp, next_timestamp, Settings(), - 0, grid) + 0, 7.7, grid) wire_data = msg.encoded() msg_out = MPPMessage.from_bytes(wire_data) diff --git a/libmuscle/python/libmuscle/test/test_outbox.py b/libmuscle/python/libmuscle/test/test_outbox.py index cb4af31a..a2e97c40 100644 --- a/libmuscle/python/libmuscle/test/test_outbox.py +++ b/libmuscle/python/libmuscle/test/test_outbox.py @@ -19,7 +19,7 @@ def message(): Ref('sender.out'), Ref('receiver.in'), None, 0.0, 1.0, bytes(), - 0, + 0, 1.0, 'testing'.encode('utf-8')) From fc9168dd8a046694e4c0bf294cd82f657daca87b Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Tue, 10 Jan 2023 13:29:39 +0100 Subject: [PATCH 121/183] Improved API Guard error messages --- libmuscle/python/libmuscle/api_guard.py | 87 ++++++++++++++++--------- 1 file changed, 57 insertions(+), 30 deletions(-) diff --git a/libmuscle/python/libmuscle/api_guard.py b/libmuscle/python/libmuscle/api_guard.py index 1f4fbfd3..56012448 100644 --- a/libmuscle/python/libmuscle/api_guard.py +++ b/libmuscle/python/libmuscle/api_guard.py @@ -22,6 +22,9 @@ class APIPhase(Enum): we know that we should expect resume() after reuse_instance() and we use BEFORE_RESUMING accordingly. """ + BEFORE_FIRST_REUSE_INSTANCE = auto() + """Before the first time calling reuse_instance""" + BEFORE_REUSE_INSTANCE = auto() """Before calling reuse_instance""" @@ -62,11 +65,47 @@ class APIGuard: def __init__(self) -> None: """Create an APIPhaseTracker. - This starts the tracker in BEFORE_REUSE_INSTANCE. + This starts the tracker in BEFORE_FIRST_REUSE_INSTANCE. """ - self._phase = APIPhase.BEFORE_REUSE_INSTANCE + self._phase = APIPhase.BEFORE_FIRST_REUSE_INSTANCE self._uses_checkpointing = None # type: Optional[bool] + def _generic_error_messages(self, verify_phase: str) -> None: + if self._phase in ( + APIPhase.BEFORE_FIRST_REUSE_INSTANCE, + APIPhase.AFTER_REUSE_LOOP): + msg = f'Please only call {verify_phase} inside the reuse loop.' + elif self._phase == APIPhase.BEFORE_REUSE_INSTANCE: + msg = ( + 'Please do not call {verify_phase} after' + ' should_save_final_snapshot. should_save_final_snapshot' + ' should be at the end of the reuse loop.') + elif self._phase == APIPhase.AFTER_REUSE_INSTANCE: + msg = ( + 'Please call resuming first in the reuse loop, before' + f' {verify_phase}') + elif self._phase == APIPhase.BEFORE_RESUMING: + msg = 'Inside the reuse loop you must call resuming first.' + elif self._phase == APIPhase.BEFORE_LOAD_SNAPSHOT: + msg = ( + 'If resuming returns True, then you must call' + ' load_snapshot first.') + elif self._phase == APIPhase.BEFORE_SHOULD_INIT: + msg = 'After calling resuming, you must call should_init first.' + elif self._phase == APIPhase.BEFORE_SHOULD_SAVE_SNAPSHOT: + msg = 'You must call save_snapshot or save_final_snapshot first.' + elif self._phase == APIPhase.BEFORE_SAVE_SNAPSHOT: + msg = ( + 'If should_save_snapshot returns True, then you must' + ' call save_snapshot first.') + elif self._phase == APIPhase.BEFORE_SAVE_FINAL_SNAPSHOT: + msg = ( + 'If should_save_final_snapshot returns True, then you' + ' must call save_final_snapshot first.') + else: + return + raise RuntimeError(msg) + def uses_checkpointing(self) -> bool: """Return whether the code is using checkpointing. @@ -90,8 +129,13 @@ def verify_reuse_instance(self) -> None: """Check reuse_instance()""" if self._phase == APIPhase.AFTER_REUSE_INSTANCE: self._uses_checkpointing = False - elif self._phase != APIPhase.BEFORE_REUSE_INSTANCE: - raise RuntimeError() + elif self._phase not in ( + APIPhase.BEFORE_REUSE_INSTANCE, + APIPhase.BEFORE_FIRST_REUSE_INSTANCE): + raise RuntimeError( + 'We reached the end of the reuse loop without checking' + ' if a snapshot should be saved. Please add at least' + ' a should_save_final_snapshot and save_final_snapshot.') def reuse_instance_done(self, reusing: bool) -> None: """Update phase on successful reuse_instance(). @@ -154,10 +198,8 @@ def should_init_done(self) -> None: def verify_should_save_snapshot(self) -> None: """Check should_save_snapshot()""" if self._phase != APIPhase.BEFORE_SHOULD_SAVE_SNAPSHOT: - raise RuntimeError( - 'We reached the end of the reuse loop without checking' - ' if a snapshot should be saved. Please add at least' - ' a should_save_final_snapshot and save_final_snapshot.') + self._generic_error_messages('should_save_snapshot') + raise RuntimeError() # should be unreachable def should_save_snapshot_done(self, should_save: bool) -> None: """Update phase on successful should_save_snapshot(). @@ -169,9 +211,10 @@ def should_save_snapshot_done(self, should_save: bool) -> None: self._phase = APIPhase.BEFORE_SAVE_SNAPSHOT def verify_save_snapshot(self) -> None: - """Check should_save_snapshot()""" + """Check save_snapshot()""" if self._phase != APIPhase.BEFORE_SAVE_SNAPSHOT: - raise RuntimeError() + self._generic_error_messages('save_snapshot') + raise RuntimeError() # should be unreachable def save_snapshot_done(self) -> None: """Update phase on successful save_snapshot()""" @@ -180,25 +223,8 @@ def save_snapshot_done(self) -> None: def verify_should_save_final_snapshot(self) -> None: """Check should_save_final_snapshot().""" if self._phase != APIPhase.BEFORE_SHOULD_SAVE_SNAPSHOT: - if self._phase in ( - APIPhase.BEFORE_REUSE_INSTANCE, APIPhase.AFTER_REUSE_LOOP): - msg = ( - 'Please only call should_save_final_snapshot inside' - ' the reuse loop.') - elif self._phase == APIPhase.BEFORE_SAVE_FINAL_SNAPSHOT: - msg = ( - 'If should_save_final_snapshot returns True, then you' - ' must call save_final_snapshot immediately.') - elif self._phase == APIPhase.BEFORE_SAVE_SNAPSHOT: - msg = ( - 'If should_save_snapshot returns True, then you must' - ' call save_snapshot first.') - else: - msg = ( - 'Please only call should_save_final_snapshot at the' - ' end of the reuse loop.') - - raise RuntimeError(msg) + self._generic_error_messages('should_save_final_snapshot') + raise RuntimeError() # should be unreachable def should_save_final_snapshot_done(self, should_save: bool) -> None: """Update phase on successful should_save_snapshot(). @@ -214,7 +240,8 @@ def should_save_final_snapshot_done(self, should_save: bool) -> None: def verify_save_final_snapshot(self) -> None: """Check should_save_final_snapshot()""" if self._phase != APIPhase.BEFORE_SAVE_FINAL_SNAPSHOT: - raise RuntimeError() + self._generic_error_messages('save_final_snapshot') + raise RuntimeError() # should be unreachable def save_final_snapshot_done(self) -> None: """Updates state on successful save_final_snapshot()""" From ced9d99952334d158dd51b5cb72c8d889ffe919f Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Tue, 10 Jan 2023 13:53:25 +0100 Subject: [PATCH 122/183] Add logging to harmonise_wall_time --- libmuscle/python/libmuscle/checkpoint_triggers.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/libmuscle/python/libmuscle/checkpoint_triggers.py b/libmuscle/python/libmuscle/checkpoint_triggers.py index 7ea2759d..6bf5f059 100644 --- a/libmuscle/python/libmuscle/checkpoint_triggers.py +++ b/libmuscle/python/libmuscle/checkpoint_triggers.py @@ -206,6 +206,9 @@ def harmonise_wall_time(self, at_least: float) -> None: """ cur = self.elapsed_walltime() if cur < at_least: + _logger.debug( + 'Harmonise wall time: advancing clock by %f seconds', + at_least - cur) self._mono_to_elapsed += at_least - cur def snapshots_enabled(self) -> bool: From e402fdfcb50866335d6fdc3229f554652f8423e7 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Tue, 10 Jan 2023 13:55:57 +0100 Subject: [PATCH 123/183] Replace isinstance check for ImplementationState --- libmuscle/python/libmuscle/instance.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py index 1f8dc1dd..8f8321f2 100644 --- a/libmuscle/python/libmuscle/instance.py +++ b/libmuscle/python/libmuscle/instance.py @@ -50,13 +50,7 @@ def __init__(self, ports: Optional[Dict[Operator, List[str]]] = None, """ self.__is_shut_down = False - if not isinstance(stateful, ImplementationState): - raise ValueError( - f'Invalid value supplied for "stateful": {stateful}.' - ' Expected one of ImplementationState.STATEFUL,' - ' ImplementationState.STATELESS or ImplementationState.' - 'WEAKLY_STATEFUL.') - self._stateful = stateful + self._stateful = ImplementationState(stateful) # Note that these are accessed by Muscle3, but otherwise private. self._name, self._index = self.__make_full_name() From 64544d872611688c621c2fc310fe5a5a2f3f9acc Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Tue, 10 Jan 2023 13:57:29 +0100 Subject: [PATCH 124/183] Verify snapshot Messages are not None None messages indicate implicit snapshots, which are handled differently --- libmuscle/python/libmuscle/instance.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py index 8f8321f2..bb559636 100644 --- a/libmuscle/python/libmuscle/instance.py +++ b/libmuscle/python/libmuscle/instance.py @@ -561,6 +561,8 @@ def save_snapshot(self, message: Message) -> None: store the internal state of the submodel. """ self._api_guard.verify_save_snapshot() + if message is None: + raise RuntimeError('Please specify a Message to save as snapshot.') self._save_snapshot(message, False) self._api_guard.save_snapshot_done() @@ -627,6 +629,8 @@ def save_final_snapshot(self, message: Message) -> None: submodel. """ self._api_guard.verify_save_final_snapshot() + if message is None: + raise RuntimeError('Please specify a Message to save as snapshot.') self._save_snapshot(message, True, self.__f_init_max_timestamp) self._api_guard.save_final_snapshot_done() From 4f7f0108b4fff3ab2a0fc454c05f5cc90d137859 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Mon, 5 Dec 2022 14:13:02 +0100 Subject: [PATCH 125/183] Fix should_save_final_snapshot when not reusing ClosePort messages have `inf` timestamps, so would always trigger a final snapshot. Only expected when `at_end` checkpoints should be taken. --- libmuscle/python/libmuscle/checkpoint_triggers.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/libmuscle/python/libmuscle/checkpoint_triggers.py b/libmuscle/python/libmuscle/checkpoint_triggers.py index 6bf5f059..b0064c75 100644 --- a/libmuscle/python/libmuscle/checkpoint_triggers.py +++ b/libmuscle/python/libmuscle/checkpoint_triggers.py @@ -233,9 +233,10 @@ def should_save_final_snapshot( return False value = False - if not do_reuse and self._checkpoint_at_end: - value = True - self._last_triggers.append('at_end') + if not do_reuse: + if self._checkpoint_at_end: + value = True + self._last_triggers.append('at_end') elif f_init_max_timestamp is None: # No F_INIT messages received: reuse triggered on muscle_settings_in # message. From 8a9538f25cd90f6e87068d011f7dddcd8c2feba9 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Tue, 10 Jan 2023 21:58:57 +0100 Subject: [PATCH 126/183] Incorporate latest yMMSL changes --- .../test_snapshot_complex_coupling.py | 4 ++-- integration_test/test_snapshot_dispatch.py | 4 ++-- integration_test/test_snapshot_macro_micro.py | 6 +++--- libmuscle/python/libmuscle/instance.py | 21 ++++++++++--------- .../manager/test/test_snapshot_registry.py | 18 ++++++++-------- 5 files changed, 27 insertions(+), 26 deletions(-) diff --git a/integration_test/test_snapshot_complex_coupling.py b/integration_test/test_snapshot_complex_coupling.py index a75a89ce..dad2ee34 100644 --- a/integration_test/test_snapshot_complex_coupling.py +++ b/integration_test/test_snapshot_complex_coupling.py @@ -2,7 +2,7 @@ import time import pytest -from ymmsl import ImplementationState, Operator, load, dump +from ymmsl import KeepsStateForNextUse, Operator, load, dump from libmuscle import Instance, Message from libmuscle.manager.run_dir import RunDir @@ -57,7 +57,7 @@ def cache_component(max_channels=2): def echo_component(max_channels=2): ports = {Operator.F_INIT: [f'in{i+1}' for i in range(max_channels)], Operator.O_F: [f'out{i+1}' for i in range(max_channels)]} - instance = Instance(ports, stateful=ImplementationState.STATELESS) + instance = Instance(ports, keeps_state_for_next_use=KeepsStateForNextUse.NO) while instance.reuse_instance(): for p_in, p_out in zip(ports[Operator.F_INIT], ports[Operator.O_F]): diff --git a/integration_test/test_snapshot_dispatch.py b/integration_test/test_snapshot_dispatch.py index 106f6d3c..7102a43c 100644 --- a/integration_test/test_snapshot_dispatch.py +++ b/integration_test/test_snapshot_dispatch.py @@ -1,5 +1,5 @@ import pytest -from ymmsl import ImplementationState, Operator, load, dump +from ymmsl import KeepsStateForNextUse, Operator, load, dump from libmuscle import Instance, Message from libmuscle.manager.run_dir import RunDir @@ -47,7 +47,7 @@ def stateless_component(): instance = Instance({ Operator.F_INIT: ['f_i'], Operator.O_F: ['o_f']}, - stateful=ImplementationState.STATELESS) + stateful=KeepsStateForNextUse.NO) while instance.reuse_instance(): dt = instance.get_setting('dt', 'float') diff --git a/integration_test/test_snapshot_macro_micro.py b/integration_test/test_snapshot_macro_micro.py index f8b11cb4..885ac704 100644 --- a/integration_test/test_snapshot_macro_micro.py +++ b/integration_test/test_snapshot_macro_micro.py @@ -1,5 +1,5 @@ import pytest -from ymmsl import ImplementationState, Operator, load, dump +from ymmsl import KeepsStateForNextUse, Operator, load, dump from libmuscle import Instance, Message from libmuscle.manager.run_dir import RunDir @@ -127,7 +127,7 @@ def stateless_micro(): instance = Instance({ Operator.F_INIT: ['f_i'], Operator.O_F: ['o_f']}, - stateful=ImplementationState.STATELESS) + keeps_state_for_next_use=KeepsStateForNextUse.NO) while instance.reuse_instance(): dt = instance.get_setting('dt', 'float') @@ -149,7 +149,7 @@ def data_transformer(): instance = Instance({ Operator.F_INIT: ['f_i'], Operator.O_F: ['o_f']}, - stateful=ImplementationState.STATELESS) + keeps_state_for_next_use=KeepsStateForNextUse.NO) while instance.reuse_instance(): msg = instance.receive('f_i') diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py index bb559636..e4e685f7 100644 --- a/libmuscle/python/libmuscle/instance.py +++ b/libmuscle/python/libmuscle/instance.py @@ -7,7 +7,7 @@ from typing_extensions import Literal from ymmsl import (Identifier, Operator, SettingValue, Port, Reference, - Settings, ImplementationState) + Settings, KeepsStateForNextUse) from libmuscle.api_guard import APIGuard from libmuscle.checkpoint_triggers import TriggerManager @@ -35,22 +35,23 @@ class Instance: This class provides a low-level send/receive API for the instance to use. """ - def __init__(self, ports: Optional[Dict[Operator, List[str]]] = None, - stateful: ImplementationState = ImplementationState.STATEFUL - ) -> None: + def __init__( + self, ports: Optional[Dict[Operator, List[str]]] = None, + keeps_state_for_next_use: KeepsStateForNextUse + = KeepsStateForNextUse.NECESSARY) -> None: """Create an Instance. Args: ports: A list of port names for each :external:py:class:`~ymmsl.Operator` of this component. - stateful: Indicate whether this instance carries state between - iterations of the reuse loop. See - :external:py:class:`ymmsl.ImplementationState` for a description - of the options. + keeps_state_for_next_use: Indicate whether this instance carries + state between iterations of the reuse loop. See + :external:py:class:`ymmsl.KeepsStateForNextUse` for a + description of the options. """ self.__is_shut_down = False - self._stateful = ImplementationState(stateful) + self._keeps_state = KeepsStateForNextUse(keeps_state_for_next_use) # Note that these are accessed by Muscle3, but otherwise private. self._name, self._index = self.__make_full_name() @@ -183,7 +184,7 @@ def reuse_instance(self, apply_overlay: bool = True) -> bool: do_implicit_checkpoint = ( not self._first_run and not self._api_guard.uses_checkpointing() and - self._stateful is not ImplementationState.STATEFUL) + self._keeps_state is not KeepsStateForNextUse.NECESSARY) if do_implicit_checkpoint: if self._trigger_manager.should_save_final_snapshot( diff --git a/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py b/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py index 6b9838e6..2d71630b 100644 --- a/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py +++ b/libmuscle/python/libmuscle/manager/test/test_snapshot_registry.py @@ -5,7 +5,7 @@ import pytest from ymmsl import ( Configuration, Model, Component, Conduit, Implementation, - ImplementationState as IState, Reference) + KeepsStateForNextUse, Reference) from libmuscle.manager.snapshot_registry import ( SnapshotNode, SnapshotRegistry, calc_consistency, calc_consistency_list, @@ -35,14 +35,14 @@ def macro_micro(micro_is_stateless: bool) -> Configuration: if micro_is_stateless: micro_impl = Implementation( - 'micro_impl', stateful=IState.STATELESS, executable='pass') + 'micro_impl', + keeps_state_for_next_use=KeepsStateForNextUse.NO, + executable='pass') else: - micro_impl = Implementation( - 'micro_impl', supports_checkpoint=True, executable='pass') + micro_impl = Implementation('micro_impl', executable='pass') implementations = [ - Implementation( - 'macro_impl', supports_checkpoint=True, executable='pass'), + Implementation('macro_impl', executable='pass'), micro_impl] return Configuration(model, implementations=implementations) @@ -60,9 +60,9 @@ def uq(macro_micro: Configuration) -> Configuration: Conduit('rr.back_out', 'macro.muscle_settings_in'), Conduit('macro.final_state_out', 'rr.back_in')]) macro_micro.implementations[Reference('qmc_impl')] = Implementation( - 'qmc_impl', supports_checkpoint=True, executable='pass') + 'qmc_impl', executable='pass') macro_micro.implementations[Reference('rr_impl')] = Implementation( - 'rr_impl', supports_checkpoint=True, executable='pass') + 'rr_impl', executable='pass') return macro_micro @@ -381,7 +381,7 @@ def test_heuristic_rollbacks() -> None: conduits = [Conduit(f'comp{i}.o_f', f'comp{i+1}.f_i') for i in range(3)] model = Model('linear', components, conduits) implementations = [ - Implementation(f'impl{i}', supports_checkpoint=True, script='xyz') + Implementation(f'impl{i}', script='xyz') for i in range(4)] config = Configuration(model, implementations=implementations) From 4dfbd56277864346d23744e8bab65d970947d70b Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Wed, 11 Jan 2023 10:03:20 +0100 Subject: [PATCH 127/183] Update tox.ini to refer to ymmsl@develop branch --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index 006e8901..020ee9e2 100644 --- a/tox.ini +++ b/tox.ini @@ -8,7 +8,7 @@ deps = flake8<6 pytest pytest-cov - git+https://github.com/multiscale/ymmsl-python.git@feature/checkpointing#egg=ymmsl + git+https://github.com/multiscale/ymmsl-python.git@develop#egg=ymmsl passenv = MUSCLE_TEST_PYTHON_ONLY From bc43c516ec82c9ea0ec369876f0fc8aa1edb8ae0 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Wed, 11 Jan 2023 10:04:17 +0100 Subject: [PATCH 128/183] Fix mypy error (np.bool8 -> np.bool_) Fixes #147 --- libmuscle/python/libmuscle/mpp_message.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libmuscle/python/libmuscle/mpp_message.py b/libmuscle/python/libmuscle/mpp_message.py index c57effb3..976f9f5e 100644 --- a/libmuscle/python/libmuscle/mpp_message.py +++ b/libmuscle/python/libmuscle/mpp_message.py @@ -93,7 +93,7 @@ def _decode_grid(code: int, data: bytes) -> Grid: ExtTypeId.GRID_INT64: np.int64, ExtTypeId.GRID_FLOAT32: np.float32, ExtTypeId.GRID_FLOAT64: np.float64, - ExtTypeId.GRID_BOOL: np.bool8} + ExtTypeId.GRID_BOOL: np.bool_} order_map = { 'fa': 'F', From 6a9fea67d16f1088540ea3a9f63bcd769403a293 Mon Sep 17 00:00:00 2001 From: Lourens Veen Date: Wed, 11 Jan 2023 11:06:48 +0100 Subject: [PATCH 129/183] Use ymmsl@develop for the examples as well --- docs/source/examples/python/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/examples/python/requirements.txt b/docs/source/examples/python/requirements.txt index fa14df52..cabe0c71 100644 --- a/docs/source/examples/python/requirements.txt +++ b/docs/source/examples/python/requirements.txt @@ -3,6 +3,6 @@ numpy<1.22; python_version=='3.7' numpy>=1.22,<=1.25; python_version>='3.8' sobol_seq==0.2.0 yatiml==0.9.0 -ymmsl>=0.12.0,<0.13 +git+https://github.com/multiscale/ymmsl-python.git@develop#egg=ymmsl qcg-pilotjob==0.13.1 From db4659224215987582284777701f4967791be1b7 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Wed, 11 Jan 2023 13:55:48 +0100 Subject: [PATCH 130/183] Remove `snapshots_enabled` API call --- libmuscle/python/libmuscle/checkpoint_triggers.py | 5 ----- libmuscle/python/libmuscle/instance.py | 12 ------------ 2 files changed, 17 deletions(-) diff --git a/libmuscle/python/libmuscle/checkpoint_triggers.py b/libmuscle/python/libmuscle/checkpoint_triggers.py index b0064c75..d639f48a 100644 --- a/libmuscle/python/libmuscle/checkpoint_triggers.py +++ b/libmuscle/python/libmuscle/checkpoint_triggers.py @@ -211,11 +211,6 @@ def harmonise_wall_time(self, at_least: float) -> None: at_least - cur) self._mono_to_elapsed += at_least - cur - def snapshots_enabled(self) -> bool: - """Check if the current workflow has snapshots enabled. - """ - return self._has_checkpoints - def should_save_snapshot(self, timestamp: float) -> bool: """Handles instance.should_save_snapshot """ diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py index e4e685f7..732f6919 100644 --- a/libmuscle/python/libmuscle/instance.py +++ b/libmuscle/python/libmuscle/instance.py @@ -446,18 +446,6 @@ def receive_with_settings( """ return self.__receive_message(port_name, slot, default, True) - def snapshots_enabled(self) -> bool: - """Check if the current workflow has snapshots enabled. - - When snapshots are not enabled, all calls to - :meth:`should_save_snapshot` and :meth:`should_save_final_snapshot` will - return False. - - Returns: - True iff checkpoint rules are defined in the workflow yMMSL. - """ - return self._trigger_manager.snapshots_enabled() - def resuming(self) -> bool: """Check if this instance is resuming from a snapshot. From d9c395057426592ff85538b12279f145a35c395c Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Wed, 11 Jan 2023 14:14:22 +0100 Subject: [PATCH 131/183] Remove "sim_reset" logic --- libmuscle/python/libmuscle/checkpoint_triggers.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/libmuscle/python/libmuscle/checkpoint_triggers.py b/libmuscle/python/libmuscle/checkpoint_triggers.py index d639f48a..434f5a5d 100644 --- a/libmuscle/python/libmuscle/checkpoint_triggers.py +++ b/libmuscle/python/libmuscle/checkpoint_triggers.py @@ -189,7 +189,6 @@ def set_checkpoint_info( self._sim = CombinedCheckpointTriggers(checkpoints.simulation_time) self._prevsim = None # type: Optional[float] self._nextsim = None # type: Optional[float] - self._sim_reset = True def elapsed_walltime(self) -> float: """Returns elapsed wallclock_time in seconds. @@ -237,7 +236,6 @@ def should_save_final_snapshot( # message. _logger.debug('Reuse triggered by muscle_settings_in.' ' Not creating a snapshot.') - self._sim_reset = True else: value = self.__should_save(f_init_max_timestamp) @@ -256,10 +254,6 @@ def update_checkpoints(self, timestamp: float) -> None: self._prevsim = timestamp self._nextsim = self._sim.next_checkpoint(timestamp) - # this method is also called during resume, after which we no longer - # consider the simulation_time as reset - self._sim_reset = False - def get_triggers(self) -> List[str]: """Get trigger description(s) for the current reason for checkpointing. """ @@ -273,7 +267,7 @@ def __should_save(self, simulation_time: float) -> bool: Args: simulation_time: current/next timestamp as reported by the instance """ - if self._sim_reset: + if self._nextsim is None and self._prevsim is None: # we cannot make assumptions about the start time of a simulation, # a t=-1000 could make sense if t represents years since CE # and we should not disallow checkpointing for negative t @@ -284,7 +278,6 @@ def __should_save(self, simulation_time: float) -> bool: self._nextsim = previous else: self._nextsim = self._sim.next_checkpoint(simulation_time) - self._sim_reset = False walltime = self.elapsed_walltime() self._cpts_considered_until = walltime From 9553c9eaba28a838fc8a6a18af360d11db8436ec Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Wed, 11 Jan 2023 17:10:47 +0100 Subject: [PATCH 132/183] Implement InstanceFlags in Python Deprecates supplying `apply_overlay` in Instance.reuse_instance. Note: UQ examples are not yet updated. --- .../examples/python/interact_coupling.py | 4 +- integration_test/test_parameter_overlays.py | 7 +- .../test_snapshot_complex_coupling.py | 11 +- integration_test/test_snapshot_dispatch.py | 9 +- integration_test/test_snapshot_interact.py | 4 +- integration_test/test_snapshot_macro_micro.py | 15 ++- libmuscle/python/libmuscle/__init__.py | 11 +- libmuscle/python/libmuscle/api_guard.py | 47 ++----- libmuscle/python/libmuscle/instance.py | 120 +++++++++++++----- libmuscle/python/libmuscle/test/conftest.py | 2 +- .../python/libmuscle/test/test_api_guard.py | 20 ++- 11 files changed, 144 insertions(+), 106 deletions(-) diff --git a/docs/source/examples/python/interact_coupling.py b/docs/source/examples/python/interact_coupling.py index 3df5e11e..d0e6a836 100644 --- a/docs/source/examples/python/interact_coupling.py +++ b/docs/source/examples/python/interact_coupling.py @@ -1,7 +1,7 @@ import logging from typing import Any, Optional, Tuple, Dict -from libmuscle import Instance, Message +from libmuscle import Instance, Message, USES_CHECKPOINT_API from libmuscle.runner import run_simulation from ymmsl import ( Component, Conduit, Configuration, Model, Operator, Ports, Settings) @@ -275,7 +275,7 @@ def checkpointing_temporal_coupler() -> None: """ instance = Instance({ Operator.O_I: ['a_out', 'b_out'], - Operator.S: ['a_in', 'b_in']}) + Operator.S: ['a_in', 'b_in']}, USES_CHECKPOINT_API) while instance.reuse_instance(): if instance.resuming(): diff --git a/integration_test/test_parameter_overlays.py b/integration_test/test_parameter_overlays.py index cf091193..5b772d10 100644 --- a/integration_test/test_parameter_overlays.py +++ b/integration_test/test_parameter_overlays.py @@ -3,7 +3,7 @@ from ymmsl import (Component, Conduit, Configuration, Model, Operator, Settings) -from libmuscle import Instance, Message +from libmuscle import Instance, Message, DONT_APPLY_OVERLAY from libmuscle.runner import run_simulation @@ -49,9 +49,10 @@ def explicit_relay(): having MUSCLE handle them. This just passes all information on. """ instance = Instance({ - Operator.F_INIT: ['in[]'], Operator.O_F: ['out[]']}) + Operator.F_INIT: ['in[]'], Operator.O_F: ['out[]']}, + DONT_APPLY_OVERLAY) - while instance.reuse_instance(False): + while instance.reuse_instance(): # f_init assert instance.get_setting('test2', 'float') == 13.3 assert instance.get_port_length('in') == instance.get_port_length( diff --git a/integration_test/test_snapshot_complex_coupling.py b/integration_test/test_snapshot_complex_coupling.py index dad2ee34..e3b408de 100644 --- a/integration_test/test_snapshot_complex_coupling.py +++ b/integration_test/test_snapshot_complex_coupling.py @@ -2,9 +2,10 @@ import time import pytest -from ymmsl import KeepsStateForNextUse, Operator, load, dump +from ymmsl import Operator, load, dump -from libmuscle import Instance, Message +from libmuscle import ( + Instance, Message, HAS_NO_STATE_FOR_NEXT_USE, USES_CHECKPOINT_API) from libmuscle.manager.run_dir import RunDir from .conftest import run_manager_with_actors, ls_snapshots @@ -18,7 +19,7 @@ def cache_component(max_channels=2): Operator.O_I: [f'sub_out{i+1}' for i in range(max_channels)], Operator.S: [f'sub_in{i+1}' for i in range(max_channels)], Operator.O_F: [f'out{i+1}' for i in range(max_channels)]} - instance = Instance(ports) + instance = Instance(ports, USES_CHECKPOINT_API) cache_t = float('-inf') cache_data = [] @@ -57,7 +58,7 @@ def cache_component(max_channels=2): def echo_component(max_channels=2): ports = {Operator.F_INIT: [f'in{i+1}' for i in range(max_channels)], Operator.O_F: [f'out{i+1}' for i in range(max_channels)]} - instance = Instance(ports, keeps_state_for_next_use=KeepsStateForNextUse.NO) + instance = Instance(ports, HAS_NO_STATE_FOR_NEXT_USE) while instance.reuse_instance(): for p_in, p_out in zip(ports[Operator.F_INIT], ports[Operator.O_F]): @@ -69,7 +70,7 @@ def main_component(): instance = Instance({ Operator.O_I: ['state_out'], Operator.S: ['Ai', 'Bi', 'Ci', 'Di'], - Operator.O_F: ['o_f']}) + Operator.O_F: ['o_f']}, USES_CHECKPOINT_API) while instance.reuse_instance(): dt = instance.get_setting('dt', 'float') diff --git a/integration_test/test_snapshot_dispatch.py b/integration_test/test_snapshot_dispatch.py index 7102a43c..f604663e 100644 --- a/integration_test/test_snapshot_dispatch.py +++ b/integration_test/test_snapshot_dispatch.py @@ -1,7 +1,8 @@ import pytest -from ymmsl import KeepsStateForNextUse, Operator, load, dump +from ymmsl import Operator, load, dump -from libmuscle import Instance, Message +from libmuscle import ( + Instance, Message, HAS_NO_STATE_FOR_NEXT_USE, USES_CHECKPOINT_API) from libmuscle.manager.run_dir import RunDir from .conftest import run_manager_with_actors, ls_snapshots @@ -13,7 +14,7 @@ def component(): instance = Instance({ Operator.F_INIT: ['f_i'], - Operator.O_F: ['o_f']}) + Operator.O_F: ['o_f']}, USES_CHECKPOINT_API) while instance.reuse_instance(): dt = instance.get_setting('dt', 'float') @@ -47,7 +48,7 @@ def stateless_component(): instance = Instance({ Operator.F_INIT: ['f_i'], Operator.O_F: ['o_f']}, - stateful=KeepsStateForNextUse.NO) + HAS_NO_STATE_FOR_NEXT_USE) while instance.reuse_instance(): dt = instance.get_setting('dt', 'float') diff --git a/integration_test/test_snapshot_interact.py b/integration_test/test_snapshot_interact.py index 5492f9e2..1fc2a5d8 100644 --- a/integration_test/test_snapshot_interact.py +++ b/integration_test/test_snapshot_interact.py @@ -5,7 +5,7 @@ import pytest from ymmsl import Operator, load, dump -from libmuscle import Instance, Message +from libmuscle import Instance, Message, USES_CHECKPOINT_API from libmuscle.manager.run_dir import RunDir from .conftest import run_manager_with_actors, ls_snapshots @@ -21,7 +21,7 @@ def component(): instance = Instance({ Operator.O_I: ['o_i'], - Operator.S: ['s']}) + Operator.S: ['s']}, USES_CHECKPOINT_API) while instance.reuse_instance(): t0 = instance.get_setting('t0', 'float') diff --git a/integration_test/test_snapshot_macro_micro.py b/integration_test/test_snapshot_macro_micro.py index 885ac704..2e660606 100644 --- a/integration_test/test_snapshot_macro_micro.py +++ b/integration_test/test_snapshot_macro_micro.py @@ -1,7 +1,8 @@ import pytest -from ymmsl import KeepsStateForNextUse, Operator, load, dump +from ymmsl import Operator, load, dump -from libmuscle import Instance, Message +from libmuscle import ( + Instance, Message, HAS_NO_STATE_FOR_NEXT_USE, USES_CHECKPOINT_API) from libmuscle.manager.run_dir import RunDir from .conftest import run_manager_with_actors, ls_snapshots @@ -13,7 +14,7 @@ def macro(): instance = Instance({ Operator.O_I: ['o_i'], - Operator.S: ['s']}) + Operator.S: ['s']}, USES_CHECKPOINT_API) while instance.reuse_instance(): dt = instance.get_setting('dt', 'float') @@ -52,7 +53,7 @@ def macro(): def macro_vector(): instance = Instance({ Operator.O_I: ['o_i[]'], - Operator.S: ['s[]']}) + Operator.S: ['s[]']}, USES_CHECKPOINT_API) while instance.reuse_instance(): dt = instance.get_setting('dt', 'float') @@ -93,7 +94,7 @@ def macro_vector(): def micro(): instance = Instance({ Operator.F_INIT: ['f_i'], - Operator.O_F: ['o_f']}) + Operator.O_F: ['o_f']}, USES_CHECKPOINT_API) while instance.reuse_instance(): dt = instance.get_setting('dt', 'float') @@ -127,7 +128,7 @@ def stateless_micro(): instance = Instance({ Operator.F_INIT: ['f_i'], Operator.O_F: ['o_f']}, - keeps_state_for_next_use=KeepsStateForNextUse.NO) + HAS_NO_STATE_FOR_NEXT_USE) while instance.reuse_instance(): dt = instance.get_setting('dt', 'float') @@ -149,7 +150,7 @@ def data_transformer(): instance = Instance({ Operator.F_INIT: ['f_i'], Operator.O_F: ['o_f']}, - keeps_state_for_next_use=KeepsStateForNextUse.NO) + HAS_NO_STATE_FOR_NEXT_USE) while instance.reuse_instance(): msg = instance.receive('f_i') diff --git a/libmuscle/python/libmuscle/__init__.py b/libmuscle/python/libmuscle/__init__.py index 707dbdae..49653a02 100644 --- a/libmuscle/python/libmuscle/__init__.py +++ b/libmuscle/python/libmuscle/__init__.py @@ -1,6 +1,6 @@ from libmuscle.communicator import Message from libmuscle.grid import Grid -from libmuscle.instance import Instance +from libmuscle.instance import Instance, InstanceFlags from libmuscle.version import __version__ from libmuscle import runner @@ -8,4 +8,11 @@ # Note that libmuscle.version above is created by the build system; it's okay # that it's not present. -__all__ = ['__version__', 'Grid', 'Instance', 'Message', 'runner'] +__all__ = [ + '__version__', 'Grid', 'Instance', 'InstanceFlags', 'Message', 'runner'] + + +# export InstanceFlag members to the module namespace +# adapted from https://github.com/python/cpython/blob/3.10/Lib/re.py#L179 +globals().update(InstanceFlags.__members__) +__all__.extend(InstanceFlags.__members__) diff --git a/libmuscle/python/libmuscle/api_guard.py b/libmuscle/python/libmuscle/api_guard.py index 56012448..6fa7ee51 100644 --- a/libmuscle/python/libmuscle/api_guard.py +++ b/libmuscle/python/libmuscle/api_guard.py @@ -1,5 +1,4 @@ from enum import auto, Enum -from typing import Optional class APIPhase(Enum): @@ -28,9 +27,6 @@ class APIPhase(Enum): BEFORE_REUSE_INSTANCE = auto() """Before calling reuse_instance""" - AFTER_REUSE_INSTANCE = auto() - """At the top of the reuse loop""" - BEFORE_RESUMING = auto() """Between reuse_instance and resuming""" @@ -62,13 +58,13 @@ class APIGuard: called to signal that the corresponding function finished successfully, and that we are moving on to the next phase. """ - def __init__(self) -> None: + def __init__(self, uses_checkpointing: bool) -> None: """Create an APIPhaseTracker. This starts the tracker in BEFORE_FIRST_REUSE_INSTANCE. """ self._phase = APIPhase.BEFORE_FIRST_REUSE_INSTANCE - self._uses_checkpointing = None # type: Optional[bool] + self._uses_checkpointing = uses_checkpointing def _generic_error_messages(self, verify_phase: str) -> None: if self._phase in ( @@ -80,10 +76,6 @@ def _generic_error_messages(self, verify_phase: str) -> None: 'Please do not call {verify_phase} after' ' should_save_final_snapshot. should_save_final_snapshot' ' should be at the end of the reuse loop.') - elif self._phase == APIPhase.AFTER_REUSE_INSTANCE: - msg = ( - 'Please call resuming first in the reuse loop, before' - f' {verify_phase}') elif self._phase == APIPhase.BEFORE_RESUMING: msg = 'Inside the reuse loop you must call resuming first.' elif self._phase == APIPhase.BEFORE_LOAD_SNAPSHOT: @@ -106,30 +98,9 @@ def _generic_error_messages(self, verify_phase: str) -> None: return raise RuntimeError(msg) - def uses_checkpointing(self) -> bool: - """Return whether the code is using checkpointing. - - We can only determine that the code doesn't use checkpointing - if there are no checkpointing calls between the first and - second calls to reuse_instance. So this function should only - be called after the second call to verify_reuse_instance, or - it may raise if the code does not use checkpointing. - - Raises: - RuntimeError: if we are at a point where we cannot know - the answer yet. - """ - if self._uses_checkpointing is not None: - return self._uses_checkpointing - raise RuntimeError( - 'The API was implemented incorrectly, please consult the' - ' documentation.') - def verify_reuse_instance(self) -> None: """Check reuse_instance()""" - if self._phase == APIPhase.AFTER_REUSE_INSTANCE: - self._uses_checkpointing = False - elif self._phase not in ( + if self._phase not in ( APIPhase.BEFORE_REUSE_INSTANCE, APIPhase.BEFORE_FIRST_REUSE_INSTANCE): raise RuntimeError( @@ -146,17 +117,19 @@ def reuse_instance_done(self, reusing: bool) -> None: if not reusing: self._phase = APIPhase.AFTER_REUSE_LOOP else: - if self._uses_checkpointing is None: - self._phase = APIPhase.AFTER_REUSE_INSTANCE - elif self._uses_checkpointing: + if self._uses_checkpointing: self._phase = APIPhase.BEFORE_RESUMING else: self._phase = APIPhase.BEFORE_REUSE_INSTANCE def verify_resuming(self) -> None: """Check resuming()""" - if self._phase not in ( - APIPhase.BEFORE_RESUMING, APIPhase.AFTER_REUSE_INSTANCE): + if not self._uses_checkpointing: + raise RuntimeError( + 'Please add the flag' + ' :attr:`InstanceFlag.USES_CHECKPOINT_API` to your' + ' instance to use the MUSCLE3 checkpointing API.') + if self._phase != APIPhase.BEFORE_RESUMING: raise RuntimeError( 'Please call resuming() only as the first thing in the' ' reuse loop.') diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py index e4e685f7..406dcdbd 100644 --- a/libmuscle/python/libmuscle/instance.py +++ b/libmuscle/python/libmuscle/instance.py @@ -1,13 +1,15 @@ from copy import copy +from enum import Flag, auto import logging import os import sys from typing import cast, Dict, List, Optional, Tuple, overload # TODO: import from typing module when dropping support for python 3.7 from typing_extensions import Literal +import warnings from ymmsl import (Identifier, Operator, SettingValue, Port, Reference, - Settings, KeepsStateForNextUse) + Settings) from libmuscle.api_guard import APIGuard from libmuscle.checkpoint_triggers import TriggerManager @@ -29,6 +31,62 @@ _FInitCacheType = Dict[Tuple[str, Optional[int]], Message] +class InstanceFlags(Flag): + """Enumeration of properties that an instance may have. + + You may combine multiple flags using the bitwise OR operator `|`. For + example: + + .. code-block:: python + + from libmuscle import ( + Instance, USES_CHECKPOINT_API, KEEPS_STATE_FOR_NEXT_USE) + + ports = ... + flags = USES_CHECKPOINT_API | KEEPS_STATE_FOR_NEXT_USE + instance = Instance(ports, flags) + """ + + DONT_APPLY_OVERLAY = auto() + """Do not apply the received settings overlay during prereceive of F_INIT + messages. If you're going to use :meth:`Instance.receive_with_settings` on + your F_INIT ports, you need to set this flag when creating an + :class:`Instance`. + + If you don't know what that means, do not specify this flag and everything + will be fine. If it turns out that you did need to specify the flag, MUSCLE3 + will tell you about it in an error message and you can add it still. + """ + + USES_CHECKPOINT_API = auto() + """Indicate that this instance supports checkpointing. + + You may not use any checkpointing API calls when this flag is not supplied. + """ + + HAS_NO_STATE_FOR_NEXT_USE = auto() + """Indicate this instance does not carry state between iterations of the + reuse loop. + + This corresponds to :external:py:attr:`ymmsl.KeepsStateForNextUse.NO`. + + If neither :attr:`HAS_NO_STATE_FOR_NEXT_USE` and + :attr:`STATE_FOR_NEXT_USE_NOT_REQUIRED` are supplied, this corresponds to + :external:py:attr:`ymmsl.KeepsStateForNextUse.REQUIRED`. + """ + + STATE_FOR_NEXT_USE_NOT_REQUIRED = auto() + """Indicate this instance carries state between iterations of the + reuse loop, however this state is not required for restarting. + + This corresponds to :external:py:attr:`ymmsl.KeepsStateForNextUse.HELPFUL`. + + If neither :attr:`HAS_NO_STATE_FOR_NEXT_USE` and + :attr:`STATE_FOR_NEXT_USE_NOT_REQUIRED` are supplied, this corresponds to + :external:py:attr:`ymmsl.KeepsStateForNextUse.REQUIRED`. + """ + + class Instance: """Represents a component instance in a MUSCLE3 simulation. @@ -37,21 +95,19 @@ class Instance: """ def __init__( self, ports: Optional[Dict[Operator, List[str]]] = None, - keeps_state_for_next_use: KeepsStateForNextUse - = KeepsStateForNextUse.NECESSARY) -> None: + flags: InstanceFlags = InstanceFlags(0)) -> None: """Create an Instance. Args: ports: A list of port names for each :external:py:class:`~ymmsl.Operator` of this component. - keeps_state_for_next_use: Indicate whether this instance carries - state between iterations of the reuse loop. See - :external:py:class:`ymmsl.KeepsStateForNextUse` for a - description of the options. + flags: Indicate properties for this instance. See + :py:class:`InstanceFlags` for a detailed description of possible + flags. """ self.__is_shut_down = False - self._keeps_state = KeepsStateForNextUse(keeps_state_for_next_use) + self._flags = InstanceFlags(flags) # Note that these are accessed by Muscle3, but otherwise private. self._name, self._index = self.__make_full_name() @@ -63,7 +119,8 @@ def __init__( self.__set_up_logging() - self._api_guard = APIGuard() + self._api_guard = APIGuard( + InstanceFlags.USES_CHECKPOINT_API in self._flags) """Checks that the user uses the API correctly.""" self._profiler = Profiler(self._instance_name(), self.__manager) @@ -126,7 +183,7 @@ def __init__( self._set_local_log_level() self._set_remote_log_level() - def reuse_instance(self, apply_overlay: bool = True) -> bool: + def reuse_instance(self, apply_overlay: Optional[bool] = None) -> bool: """Decide whether to run this instance again. In a multiscale simulation, instances get reused all the time. @@ -149,16 +206,6 @@ def reuse_instance(self, apply_overlay: bool = True) -> bool: i.e. before the F_INIT operator, and its return value should decide whether to enter that loop again. - Args: - apply_overlay: Whether to apply the received settings - overlay or to save it. If you're going to use - :meth:`receive_with_settings` on your F_INIT ports, - set this to False. If you don't know what that means, - just call :meth:`reuse_instance()` without specifying this - and everything will be fine. If it turns out that you - did need to specify False, MUSCLE3 will tell you about - it in an error message and you can add it still. - Raises: RuntimeError: When implementing the checkpointing API, but libmuscle detected @@ -183,8 +230,9 @@ def reuse_instance(self, apply_overlay: bool = True) -> bool: do_implicit_checkpoint = ( not self._first_run and - not self._api_guard.uses_checkpointing() and - self._keeps_state is not KeepsStateForNextUse.NECESSARY) + InstanceFlags.USES_CHECKPOINT_API not in self._flags and + (InstanceFlags.STATE_FOR_NEXT_USE_NOT_REQUIRED in self._flags or + InstanceFlags.HAS_NO_STATE_FOR_NEXT_USE in self._flags)) if do_implicit_checkpoint: if self._trigger_manager.should_save_final_snapshot( @@ -567,7 +615,7 @@ def save_snapshot(self, message: Message) -> None: self._save_snapshot(message, False) self._api_guard.save_snapshot_done() - def should_save_final_snapshot(self, *, apply_overlay: bool = True) -> bool: + def should_save_final_snapshot(self) -> bool: """Check if a snapshot should be saved at the end of the reuse loop. This method checks if a snapshot should be saved now. @@ -601,7 +649,7 @@ def should_save_final_snapshot(self, *, apply_overlay: bool = True) -> bool: """ self._api_guard.verify_should_save_final_snapshot() - self._do_reuse = self._decide_reuse_instance(apply_overlay) + self._do_reuse = self._decide_reuse_instance() result = self._trigger_manager.should_save_final_snapshot( self._do_reuse, self.__f_init_max_timestamp) @@ -718,7 +766,8 @@ def __set_up_logging(self) -> None: self.__manager) logging.getLogger().addHandler(self._mmp_handler) - def _decide_reuse_instance(self, apply_overlay: bool) -> bool: + def _decide_reuse_instance( + self, apply_overlay: Optional[bool] = None) -> bool: """Decide whether and how to reuse the instance. This sets self._first_run, self._do_resume and self._do_init, and @@ -801,10 +850,11 @@ def __receive_message( if with_settings and msg.settings is None: err_msg = ('If you use receive_with_settings()' ' on an F_INIT port, then you have to' - ' pass apply_overlay=False to reuse_instance() ' - ' and should_save_final_snapshot(),' - ' if applicable, otherwise the settings will' - ' already have been applied by MUSCLE.') + ' set the flag' + ' :attr:`InstanceFlag.DONT_APPLY_OVERLAY` when' + ' creating the :class:`Instance`, otherwise the' + ' settings will already have been applied by' + ' MUSCLE.') self.__shutdown(err_msg) raise RuntimeError(err_msg) else: @@ -922,7 +972,7 @@ def _have_f_init_connections(self) -> bool: for port in ports.get(Operator.F_INIT, [])]) return f_init_connected or self._communicator.settings_in_connected() - def _pre_receive(self, apply_overlay: bool) -> bool: + def _pre_receive(self, apply_overlay: Optional[bool]) -> bool: """Pre-receives on all ports. This includes muscle_settings_in and all user-defined ports. @@ -965,12 +1015,20 @@ def __receive_settings(self) -> bool: self._trigger_manager.harmonise_wall_time(saved_until) return True - def __pre_receive_f_init(self, apply_overlay: bool) -> None: + def __pre_receive_f_init(self, apply_overlay: Optional[bool]) -> None: """Receives on all ports connected to F_INIT. This receives all incoming messages on F_INIT and stores them in self._f_init_cache. """ + if apply_overlay is not None: + warnings.warn( + 'Explicitly providing apply_overlay in reuse_instance is' + ' deprecated. Use InstanceFlags.DONT_APPLY_OVERLAY when' + ' creating the instance instead.', DeprecationWarning) + else: + apply_overlay = InstanceFlags.DONT_APPLY_OVERLAY not in self._flags + def pre_receive(port_name: str, slot: Optional[int]) -> None: msg, saved_until = self._communicator.receive_message( port_name, slot) diff --git a/libmuscle/python/libmuscle/test/conftest.py b/libmuscle/python/libmuscle/test/conftest.py index 77422ee9..201a10f4 100644 --- a/libmuscle/python/libmuscle/test/conftest.py +++ b/libmuscle/python/libmuscle/test/conftest.py @@ -26,4 +26,4 @@ def message2() -> Message: @pytest.fixture def guard() -> APIGuard: - return APIGuard() + return APIGuard(True) diff --git a/libmuscle/python/libmuscle/test/test_api_guard.py b/libmuscle/python/libmuscle/test/test_api_guard.py index f67bde93..4636c775 100644 --- a/libmuscle/python/libmuscle/test/test_api_guard.py +++ b/libmuscle/python/libmuscle/test/test_api_guard.py @@ -5,20 +5,17 @@ from libmuscle.api_guard import APIGuard -def test_no_checkpointing_support(guard): +def test_no_checkpointing_support(): + guard = APIGuard(False) for _ in range(3): guard.verify_reuse_instance() guard.reuse_instance_done(True) - assert not guard.uses_checkpointing() - guard.verify_reuse_instance() guard.reuse_instance_done(False) - assert not guard.uses_checkpointing() - -def test_final_snapshot_only(guard): +def test_final_snapshot_only(guard: APIGuard): for i in range(4): guard.verify_reuse_instance() guard.reuse_instance_done(True) @@ -48,7 +45,7 @@ def test_final_snapshot_only(guard): guard.reuse_instance_done(False) -def test_full_checkpointing(guard): +def test_full_checkpointing(guard: APIGuard): for i in range(4): guard.verify_reuse_instance() guard.reuse_instance_done(True) @@ -133,20 +130,19 @@ def test_missing_step(guard, fun): check_all_raise_except(guard, {fun}) -def test_missing_resuming(guard): +def test_missing_resuming(guard: APIGuard): run_until_before(guard, APIGuard.verify_resuming) - check_all_raise_except(guard, { - APIGuard.verify_resuming, APIGuard.verify_reuse_instance}) + check_all_raise_except(guard, {APIGuard.verify_resuming}) -def test_missing_should_save_final(guard): +def test_missing_should_save_final(guard: APIGuard): run_until_before(guard, APIGuard.verify_should_save_final_snapshot) check_all_raise_except(guard, { APIGuard.verify_should_save_snapshot, APIGuard.verify_should_save_final_snapshot}) -def test_double_should_save(guard): +def test_double_should_save(guard: APIGuard): run_until_before(guard, APIGuard.verify_should_save_snapshot) guard.verify_should_save_snapshot() guard.should_save_snapshot_done(True) From 262968146b26f12b686692b2f286e346f489d538 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Wed, 11 Jan 2023 17:12:34 +0100 Subject: [PATCH 133/183] Fix docstring --- libmuscle/python/libmuscle/instance.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py index 406dcdbd..d0db648a 100644 --- a/libmuscle/python/libmuscle/instance.py +++ b/libmuscle/python/libmuscle/instance.py @@ -40,10 +40,10 @@ class InstanceFlags(Flag): .. code-block:: python from libmuscle import ( - Instance, USES_CHECKPOINT_API, KEEPS_STATE_FOR_NEXT_USE) + Instance, USES_CHECKPOINT_API, DONT_APPLY_OVERLAY) ports = ... - flags = USES_CHECKPOINT_API | KEEPS_STATE_FOR_NEXT_USE + flags = USES_CHECKPOINT_API | DONT_APPLY_OVERLAY instance = Instance(ports, flags) """ From c3177f667d58260e2c7422cefb270f7216f3be3d Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Thu, 12 Jan 2023 10:21:19 +0100 Subject: [PATCH 134/183] Process review comments --- .../test_snapshot_complex_coupling.py | 4 ++-- integration_test/test_snapshot_dispatch.py | 4 ++-- integration_test/test_snapshot_macro_micro.py | 6 +++--- libmuscle/python/libmuscle/instance.py | 16 ++++++++-------- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/integration_test/test_snapshot_complex_coupling.py b/integration_test/test_snapshot_complex_coupling.py index e3b408de..e374347d 100644 --- a/integration_test/test_snapshot_complex_coupling.py +++ b/integration_test/test_snapshot_complex_coupling.py @@ -5,7 +5,7 @@ from ymmsl import Operator, load, dump from libmuscle import ( - Instance, Message, HAS_NO_STATE_FOR_NEXT_USE, USES_CHECKPOINT_API) + Instance, Message, KEEPS_NO_STATE_FOR_NEXT_USE, USES_CHECKPOINT_API) from libmuscle.manager.run_dir import RunDir from .conftest import run_manager_with_actors, ls_snapshots @@ -58,7 +58,7 @@ def cache_component(max_channels=2): def echo_component(max_channels=2): ports = {Operator.F_INIT: [f'in{i+1}' for i in range(max_channels)], Operator.O_F: [f'out{i+1}' for i in range(max_channels)]} - instance = Instance(ports, HAS_NO_STATE_FOR_NEXT_USE) + instance = Instance(ports, KEEPS_NO_STATE_FOR_NEXT_USE) while instance.reuse_instance(): for p_in, p_out in zip(ports[Operator.F_INIT], ports[Operator.O_F]): diff --git a/integration_test/test_snapshot_dispatch.py b/integration_test/test_snapshot_dispatch.py index f604663e..021ac676 100644 --- a/integration_test/test_snapshot_dispatch.py +++ b/integration_test/test_snapshot_dispatch.py @@ -2,7 +2,7 @@ from ymmsl import Operator, load, dump from libmuscle import ( - Instance, Message, HAS_NO_STATE_FOR_NEXT_USE, USES_CHECKPOINT_API) + Instance, Message, KEEPS_NO_STATE_FOR_NEXT_USE, USES_CHECKPOINT_API) from libmuscle.manager.run_dir import RunDir from .conftest import run_manager_with_actors, ls_snapshots @@ -48,7 +48,7 @@ def stateless_component(): instance = Instance({ Operator.F_INIT: ['f_i'], Operator.O_F: ['o_f']}, - HAS_NO_STATE_FOR_NEXT_USE) + KEEPS_NO_STATE_FOR_NEXT_USE) while instance.reuse_instance(): dt = instance.get_setting('dt', 'float') diff --git a/integration_test/test_snapshot_macro_micro.py b/integration_test/test_snapshot_macro_micro.py index 2e660606..2f6bc1e2 100644 --- a/integration_test/test_snapshot_macro_micro.py +++ b/integration_test/test_snapshot_macro_micro.py @@ -2,7 +2,7 @@ from ymmsl import Operator, load, dump from libmuscle import ( - Instance, Message, HAS_NO_STATE_FOR_NEXT_USE, USES_CHECKPOINT_API) + Instance, Message, KEEPS_NO_STATE_FOR_NEXT_USE, USES_CHECKPOINT_API) from libmuscle.manager.run_dir import RunDir from .conftest import run_manager_with_actors, ls_snapshots @@ -128,7 +128,7 @@ def stateless_micro(): instance = Instance({ Operator.F_INIT: ['f_i'], Operator.O_F: ['o_f']}, - HAS_NO_STATE_FOR_NEXT_USE) + KEEPS_NO_STATE_FOR_NEXT_USE) while instance.reuse_instance(): dt = instance.get_setting('dt', 'float') @@ -150,7 +150,7 @@ def data_transformer(): instance = Instance({ Operator.F_INIT: ['f_i'], Operator.O_F: ['o_f']}, - HAS_NO_STATE_FOR_NEXT_USE) + KEEPS_NO_STATE_FOR_NEXT_USE) while instance.reuse_instance(): msg = instance.receive('f_i') diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py index d0db648a..a1e13ec4 100644 --- a/libmuscle/python/libmuscle/instance.py +++ b/libmuscle/python/libmuscle/instance.py @@ -64,25 +64,25 @@ class InstanceFlags(Flag): You may not use any checkpointing API calls when this flag is not supplied. """ - HAS_NO_STATE_FOR_NEXT_USE = auto() + KEEPS_NO_STATE_FOR_NEXT_USE = auto() """Indicate this instance does not carry state between iterations of the reuse loop. This corresponds to :external:py:attr:`ymmsl.KeepsStateForNextUse.NO`. - If neither :attr:`HAS_NO_STATE_FOR_NEXT_USE` and - :attr:`STATE_FOR_NEXT_USE_NOT_REQUIRED` are supplied, this corresponds to + If neither :attr:`KEEPS_NO_STATE_FOR_NEXT_USE` and + :attr:`STATE_NOT_REQUIRED_FOR_NEXT_USE` are supplied, this corresponds to :external:py:attr:`ymmsl.KeepsStateForNextUse.REQUIRED`. """ - STATE_FOR_NEXT_USE_NOT_REQUIRED = auto() + STATE_NOT_REQUIRED_FOR_NEXT_USE = auto() """Indicate this instance carries state between iterations of the reuse loop, however this state is not required for restarting. This corresponds to :external:py:attr:`ymmsl.KeepsStateForNextUse.HELPFUL`. - If neither :attr:`HAS_NO_STATE_FOR_NEXT_USE` and - :attr:`STATE_FOR_NEXT_USE_NOT_REQUIRED` are supplied, this corresponds to + If neither :attr:`KEEPS_NO_STATE_FOR_NEXT_USE` and + :attr:`STATE_NOT_REQUIRED_FOR_NEXT_USE` are supplied, this corresponds to :external:py:attr:`ymmsl.KeepsStateForNextUse.REQUIRED`. """ @@ -231,8 +231,8 @@ def reuse_instance(self, apply_overlay: Optional[bool] = None) -> bool: do_implicit_checkpoint = ( not self._first_run and InstanceFlags.USES_CHECKPOINT_API not in self._flags and - (InstanceFlags.STATE_FOR_NEXT_USE_NOT_REQUIRED in self._flags or - InstanceFlags.HAS_NO_STATE_FOR_NEXT_USE in self._flags)) + (InstanceFlags.STATE_NOT_REQUIRED_FOR_NEXT_USE in self._flags or + InstanceFlags.KEEPS_NO_STATE_FOR_NEXT_USE in self._flags)) if do_implicit_checkpoint: if self._trigger_manager.should_save_final_snapshot( From dda214bc507aa44c78d381da39d3685eca34376c Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Thu, 12 Jan 2023 13:35:28 +0100 Subject: [PATCH 135/183] Save and restore settings overlay --- libmuscle/python/libmuscle/instance.py | 6 +++++- libmuscle/python/libmuscle/snapshot.py | 12 +++++++++--- libmuscle/python/libmuscle/snapshot_manager.py | 10 +++++++--- libmuscle/python/libmuscle/test/test_snapshot.py | 9 ++++++--- .../python/libmuscle/test/test_snapshot_manager.py | 14 +++++++++----- 5 files changed, 36 insertions(+), 15 deletions(-) diff --git a/libmuscle/python/libmuscle/instance.py b/libmuscle/python/libmuscle/instance.py index 732f6919..39bd0fe5 100644 --- a/libmuscle/python/libmuscle/instance.py +++ b/libmuscle/python/libmuscle/instance.py @@ -119,6 +119,10 @@ def __init__( resume_snapshot, snapshot_dir = checkpoint_info[2:4] saved_at = self._snapshot_manager.prepare_resume( resume_snapshot, snapshot_dir) + # Resume settings overlay + overlay = self._snapshot_manager._resume_overlay + if overlay is not None: + self._settings_manager.overlay = overlay if saved_at is not None: self._trigger_manager.update_checkpoints(saved_at) @@ -767,7 +771,7 @@ def _save_snapshot( walltime = self._trigger_manager.elapsed_walltime() timestamp = self._snapshot_manager.save_snapshot( message, final, triggers, walltime, - f_init_max_timestamp) + f_init_max_timestamp, self._settings_manager.overlay) self._trigger_manager.update_checkpoints(timestamp) def __receive_message( diff --git a/libmuscle/python/libmuscle/snapshot.py b/libmuscle/python/libmuscle/snapshot.py index 2f86a220..cda03dc5 100644 --- a/libmuscle/python/libmuscle/snapshot.py +++ b/libmuscle/python/libmuscle/snapshot.py @@ -21,12 +21,16 @@ def __init__(self, wallclock_time: float, port_message_counts: Dict[str, List[int]], is_final_snapshot: bool, - message: Optional['communicator.Message']) -> None: + message: Optional['communicator.Message'], + settings_overlay: Settings) -> None: self.triggers = triggers self.wallclock_time = wallclock_time self.port_message_counts = port_message_counts self.is_final_snapshot = is_final_snapshot self.message = message + # self.message is None for implicit snapshots, so we cannot store the + # Settings overlay in that message object. + self.settings_overlay = settings_overlay @classmethod @abstractmethod @@ -62,7 +66,8 @@ def from_bytes(cls, data: bytes) -> 'Snapshot': dct['wallclock_time'], dct['port_message_counts'], dct['is_final_snapshot'], - cls.bytes_to_message(dct['message'])) + cls.bytes_to_message(dct['message']), + Settings(dct['settings_overlay'])) def to_bytes(self) -> bytes: return cast(bytes, msgpack.dumps({ @@ -70,7 +75,8 @@ def to_bytes(self) -> bytes: 'wallclock_time': self.wallclock_time, 'port_message_counts': self.port_message_counts, 'is_final_snapshot': self.is_final_snapshot, - 'message': self.message_to_bytes(self.message) + 'message': self.message_to_bytes(self.message), + 'settings_overlay': self.settings_overlay.as_ordered_dict() })) @staticmethod diff --git a/libmuscle/python/libmuscle/snapshot_manager.py b/libmuscle/python/libmuscle/snapshot_manager.py index f756d05d..e23b73dc 100644 --- a/libmuscle/python/libmuscle/snapshot_manager.py +++ b/libmuscle/python/libmuscle/snapshot_manager.py @@ -2,7 +2,7 @@ from pathlib import Path from typing import cast, List, Optional -from ymmsl import Reference, Operator +from ymmsl import Reference, Operator, Settings from libmuscle.communicator import Communicator, Message from libmuscle.mmp_client import MMPClient @@ -44,6 +44,7 @@ def __init__(self, self._manager = manager self._resume_from_snapshot = None # type: Optional[Snapshot] + self._resume_overlay = Settings() self._next_snapshot_num = 1 def prepare_resume( @@ -73,6 +74,7 @@ def prepare_resume( # snapshot.message is None for implicit snapshots self._resume_from_snapshot = snapshot result = snapshot.message.timestamp + self._resume_overlay = snapshot.settings_overlay self._communicator.restore_message_counts( snapshot.port_message_counts) @@ -112,7 +114,8 @@ def load_snapshot(self) -> Message: def save_snapshot( self, msg: Optional[Message], final: bool, triggers: List[str], wallclock_time: float, - f_init_max_timestamp: Optional[float] = None, + f_init_max_timestamp: Optional[float], + settings_overlay: Settings ) -> float: """Save a (final) snapshot. @@ -140,7 +143,8 @@ def save_snapshot( port_message_counts[port_name] = new_counts snapshot = MsgPackSnapshot( - triggers, wallclock_time, port_message_counts, final, msg) + triggers, wallclock_time, port_message_counts, final, msg, + settings_overlay) path = self.__store_snapshot(snapshot) metadata = SnapshotMetadata.from_snapshot(snapshot, str(path)) diff --git a/libmuscle/python/libmuscle/test/test_snapshot.py b/libmuscle/python/libmuscle/test/test_snapshot.py index f459a001..372c8cd0 100644 --- a/libmuscle/python/libmuscle/test/test_snapshot.py +++ b/libmuscle/python/libmuscle/test/test_snapshot.py @@ -13,12 +13,15 @@ def snapshot() -> Snapshot: is_final = True message = Message(1.2, None, 'test_data') snapshot = MsgPackSnapshot( - triggers, wallclock_time, port_message_counts, is_final, message) + triggers, wallclock_time, port_message_counts, is_final, message, + Settings({'test': 1})) assert snapshot.triggers == triggers assert snapshot.wallclock_time == wallclock_time assert snapshot.port_message_counts == port_message_counts assert snapshot.is_final_snapshot == is_final assert snapshot.message == message + assert snapshot.settings_overlay.keys() == {'test'} + assert snapshot.settings_overlay['test'] == 1 return snapshot @@ -53,7 +56,7 @@ def test_snapshot_metadata(snapshot: Snapshot) -> None: def test_message_with_settings() -> None: message = Message(1.0, 2.0, 'test_data', Settings({'setting': True})) - snapshot = MsgPackSnapshot([], 0, {}, False, message) + snapshot = MsgPackSnapshot([], 0, {}, False, message, Settings()) assert snapshot.message.settings.get('setting') is True binary_snapshot = snapshot.to_bytes() @@ -65,7 +68,7 @@ def test_message_with_settings() -> None: def test_implicit_snapshot() -> None: message = None - snapshot = MsgPackSnapshot([], 0, {}, True, message) + snapshot = MsgPackSnapshot([], 0, {}, True, message, Settings()) assert snapshot.message is None binary_snapshot = snapshot.to_bytes() diff --git a/libmuscle/python/libmuscle/test/test_snapshot_manager.py b/libmuscle/python/libmuscle/test/test_snapshot_manager.py index 31423bb0..e530ad06 100644 --- a/libmuscle/python/libmuscle/test/test_snapshot_manager.py +++ b/libmuscle/python/libmuscle/test/test_snapshot_manager.py @@ -1,7 +1,7 @@ from pathlib import Path from unittest.mock import MagicMock -from ymmsl import Reference +from ymmsl import Reference, Settings from libmuscle.communicator import Message from libmuscle.snapshot import SnapshotMetadata @@ -33,7 +33,8 @@ def test_save_load_snapshot(tmp_path: Path) -> None: assert not snapshot_manager.resuming_from_final() snapshot_manager.save_snapshot( - Message(0.2, None, 'test data'), False, ['test'], 13.0) + Message(0.2, None, 'test data'), False, ['test'], 13.0, None, + Settings()) communicator.get_message_counts.assert_called_with() manager.submit_snapshot_metadata.assert_called() @@ -63,7 +64,8 @@ def test_save_load_snapshot(tmp_path: Path) -> None: assert msg.data == 'test data' snapshot_manager2.save_snapshot( - Message(0.6, None, 'test data2'), True, ['test'], 42.2, 1.2) + Message(0.6, None, 'test data2'), True, ['test'], 42.2, 1.2, + Settings()) instance, metadata = manager.submit_snapshot_metadata.call_args[0] assert instance == instance_id @@ -99,7 +101,8 @@ def test_save_load_implicit_snapshot(tmp_path: Path) -> None: assert not snapshot_manager.resuming_from_intermediate() assert not snapshot_manager.resuming_from_final() # save implicit snapshot - snapshot_manager.save_snapshot(None, True, ['implicit'], 1.0, 1.5) + snapshot_manager.save_snapshot( + None, True, ['implicit'], 1.0, 1.5, Settings()) manager.submit_snapshot_metadata.assert_called_once() instance, metadata = manager.submit_snapshot_metadata.call_args[0] @@ -117,5 +120,6 @@ def test_save_load_implicit_snapshot(tmp_path: Path) -> None: assert not snapshot_manager2.resuming_from_intermediate() assert not snapshot_manager2.resuming_from_final() - snapshot_manager2.save_snapshot(None, True, ['implicit'], 12.3, 2.5) + snapshot_manager2.save_snapshot( + None, True, ['implicit'], 12.3, 2.5, Settings()) manager.submit_snapshot_metadata.assert_called_once() From f38cf18ee4c73c2d7fb8fe76e95534fc238a4814 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Thu, 12 Jan 2023 15:07:07 +0100 Subject: [PATCH 136/183] Implement version check Fixes #148 --- libmuscle/cpp/build/libmuscle/Makefile | 22 +++++++---- libmuscle/cpp/src/libmuscle/.gitignore | 1 + libmuscle/cpp/src/libmuscle/mmp_client.cpp | 4 +- libmuscle/cpp/src/libmuscle/version.h.in | 2 +- .../python/libmuscle/manager/mmp_server.py | 13 ++++++- .../manager/test/test_mmp_request_handler.py | 38 ++++++++++++++++++- libmuscle/python/libmuscle/mmp_client.py | 4 +- .../python/libmuscle/test/test_mmp_client.py | 4 +- 8 files changed, 73 insertions(+), 15 deletions(-) create mode 100644 libmuscle/cpp/src/libmuscle/.gitignore diff --git a/libmuscle/cpp/build/libmuscle/Makefile b/libmuscle/cpp/build/libmuscle/Makefile index 3fdc0117..d85d53b5 100644 --- a/libmuscle/cpp/build/libmuscle/Makefile +++ b/libmuscle/cpp/build/libmuscle/Makefile @@ -45,6 +45,7 @@ public_headers := libmuscle/data.hpp libmuscle/data.tpp libmuscle/instance.hpp public_headers += libmuscle/libmuscle.hpp libmuscle/mcp/data_pack.hpp public_headers += libmuscle/mcp/data_pack.tpp libmuscle/message.hpp public_headers += libmuscle/ports_description.hpp libmuscle/util.hpp libmuscle/util.tpp +public_headers += libmuscle/version.h installed_headers := $(public_headers:%=$(PREFIX)/include/%) pkg_config_files := libmuscle.pc @@ -76,7 +77,7 @@ test: tests .PHONY: clean clean: - rm -f libmuscle.a libmuscle.so libmuscle_d.a libmuscle_d.so version.h + rm -f libmuscle.a libmuscle.so libmuscle_d.a libmuscle_d.so $(srcdir)/version.h rm -f libmuscle_mpi.a libmuscle_mpi.so libmuscle_mpi_d.a libmuscle_mpi_d.so rm -f libmuscle.pc libmuscle_mpi.pc rm -rf $(objdir) @@ -116,11 +117,11 @@ LDFLAGS += $(shell export PKG_CONFIG_PATH=$(PKG_CONFIG_PATH):$(PKG_CONFIG_EXTRA_ endif -$(objdir)/%.d: %.cpp +$(objdir)/%.d: %.cpp $(srcdir)/version.h @mkdir -p $(@D) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -E -MM -MT $(@:.d=.o) $< -o $@ -$(objdir)/%.o: %.cpp +$(objdir)/%.o: %.cpp $(srcdir)/version.h @mkdir -p $(@D) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -c $< -o $@ @@ -139,28 +140,29 @@ $(objdir)/%.dlo: %.cpp $(objdir)/%.o @mkdir -p $(@D) $(CXX) $(CPPFLAGS) $(CXXFLAGS) $(DEBUGFLAGS) -fPIC -c $< -o $@ -$(objdir)/%.mo: %.cpp +$(objdir)/%.mo: %.cpp $(srcdir)/version.h @mkdir -p $(@D) $(MPICXX) $(CPPFLAGS) $(CXXFLAGS) $(MPIFLAGS) -c $< -o $@ -$(objdir)/%.mlo: %.cpp +$(objdir)/%.mlo: %.cpp $(srcdir)/version.h @mkdir -p $(@D) $(MPICXX) $(CPPFLAGS) $(CXXFLAGS) $(MPIFLAGS) -fPIC -c $< -o $@ -$(objdir)/%.mdo: %.cpp +$(objdir)/%.mdo: %.cpp $(srcdir)/version.h @mkdir -p $(@D) $(MPICXX) $(CPPFLAGS) $(CXXFLAGS) $(DEBUGFLAGS) $(MPIFLAGS) -c $< -o $@ -$(objdir)/%.mdlo: %.cpp +$(objdir)/%.mdlo: %.cpp $(srcdir)/version.h @mkdir -p $(@D) $(MPICXX) $(CPPFLAGS) $(CXXFLAGS) $(DEBUGFLAGS) $(MPIFLAGS) -fPIC -c $< -o $@ -version.h: version.h.in +$(srcdir)/version.h: version.h.in cp $< $@ sed -i -e 's/@PROJECT_VERSION_MAJOR@/$(major_version)/' $@ sed -i -e 's/@PROJECT_VERSION_MINOR@/$(minor_version)/' $@ sed -i -e 's/@PROJECT_VERSION_PATCH@/$(patch_version)/' $@ + sed -i -e 's/@PROJECT_VERSION@/$(muscle_version)/' $@ libmuscle.a: $(objects) ar rcs $@ $^ @@ -186,6 +188,10 @@ libmuscle_mpi_d.a: $(mdobjects) libmuscle_mpi_d.so: $(mdlobjects) $(MPICXX) -shared -Wl,--version-script=libmuscle_mpi.version -o $@ $^ $(LDFLAGS) +$(PREFIX)/include/libmuscle/version.h: $(srcdir)/version.h + @mkdir -p $(@D) + cp $< $@ + $(PREFIX)/include/%.hpp: $(hdrdir)/%.hpp @mkdir -p $(@D) cp $< $@ diff --git a/libmuscle/cpp/src/libmuscle/.gitignore b/libmuscle/cpp/src/libmuscle/.gitignore new file mode 100644 index 00000000..67020331 --- /dev/null +++ b/libmuscle/cpp/src/libmuscle/.gitignore @@ -0,0 +1 @@ +version.h diff --git a/libmuscle/cpp/src/libmuscle/mmp_client.cpp b/libmuscle/cpp/src/libmuscle/mmp_client.cpp index de50e894..6a7f33d6 100644 --- a/libmuscle/cpp/src/libmuscle/mmp_client.cpp +++ b/libmuscle/cpp/src/libmuscle/mmp_client.cpp @@ -3,6 +3,7 @@ #include "libmuscle/data.hpp" #include "libmuscle/mcp/data_pack.hpp" #include "libmuscle/mcp/protocol.hpp" +#include "libmuscle/version.h" #include #include @@ -101,7 +102,8 @@ void MMPClient::register_instance( auto request = Data::list( static_cast(RequestType::register_instance), - std::string(name), encoded_locs, encoded_ports); + std::string(name), encoded_locs, encoded_ports, + MUSCLE3_VERSION); auto response = call_manager_(request); diff --git a/libmuscle/cpp/src/libmuscle/version.h.in b/libmuscle/cpp/src/libmuscle/version.h.in index 8edb3a47..67718812 100644 --- a/libmuscle/cpp/src/libmuscle/version.h.in +++ b/libmuscle/cpp/src/libmuscle/version.h.in @@ -4,5 +4,5 @@ #define MUSCLE3_VERSION_MINOR @PROJECT_VERSION_MINOR@ #define MUSCLE3_VERSION_PATCH @PROJECT_VERSION_PATCH@ -#define MUSCLE3_VERSION "MUSCLE3_VERSION_MAJOR.MUSCLE3_VERSION_MINOR.MUSCLE3_VERSION_PATCH" +#define MUSCLE3_VERSION "@PROJECT_VERSION@" diff --git a/libmuscle/python/libmuscle/manager/mmp_server.py b/libmuscle/python/libmuscle/manager/mmp_server.py index d609fce1..6fc4bae2 100644 --- a/libmuscle/python/libmuscle/manager/mmp_server.py +++ b/libmuscle/python/libmuscle/manager/mmp_server.py @@ -8,6 +8,7 @@ Conduit, Identifier, Operator, Port, Reference, PartialConfiguration, Checkpoints) +import libmuscle from libmuscle.logging import LogLevel from libmuscle.manager.instance_registry import ( AlreadyRegistered, InstanceRegistry) @@ -109,12 +110,14 @@ def handle_request(self, request: bytes) -> bytes: def _register_instance( self, instance_id: str, locations: List[str], - ports: List[List[str]]) -> Any: + ports: List[List[str]], version: str = '') -> Any: """Handle a register instance request. Args: instance_id: ID of the instance to register locations: Locations where it can be reached + ports: Ports of this instance + version: Version of libmuscle that this instance uses Returns: A list containing the following values: @@ -123,6 +126,14 @@ def _register_instance( error_msg (str): An error message, only present if status equals ERROR """ + if version != libmuscle.__version__: + return [ + ResponseType.ERROR.value, + f'Instance libmuscle version ({version}) does not match' + f' manager libmuscle version ({libmuscle.__version__}).' + ' Please ensure that the instance and the manager use the' + ' same version of libmuscle.'] + port_objs = [decode_port(p) for p in ports] instance = Reference(instance_id) try: diff --git a/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py b/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py index bc61f0a0..4b615d55 100644 --- a/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py +++ b/libmuscle/python/libmuscle/manager/test/test_mmp_request_handler.py @@ -6,6 +6,7 @@ from ymmsl import ( Operator, Reference, Checkpoints, CheckpointRangeRule, CheckpointAtRule) +import libmuscle from libmuscle.logging import LogLevel from libmuscle.manager.mmp_server import MMPRequestHandler from libmuscle.mcp.protocol import RequestType, ResponseType @@ -79,7 +80,8 @@ def test_register_instance(mmp_request_handler, instance_registry): RequestType.REGISTER_INSTANCE.value, 'test_instance', ['tcp://localhost:10000'], - [['test_in', 'F_INIT']]] + [['test_in', 'F_INIT']], + libmuscle.__version__] encoded_request = msgpack.packb(request, use_bin_type=True) result = mmp_request_handler.handle_request(encoded_request) @@ -94,6 +96,37 @@ def test_register_instance(mmp_request_handler, instance_registry): assert registered_ports['test_instance'][0].operator == Operator.F_INIT +def test_register_instance_no_version(mmp_request_handler): + request = [ + RequestType.REGISTER_INSTANCE.value, + 'test_instance', + ['tcp://localhost:10000'], + [['test_in', 'F_INIT']]] + encoded_request = msgpack.packb(request, use_bin_type=True) + + result = mmp_request_handler.handle_request(encoded_request) + decoded_result = msgpack.unpackb(result, raw=False) + + assert decoded_result[0] == ResponseType.ERROR.value + assert 'version' in decoded_result[1] + + +def test_register_instance_version_mismatch(mmp_request_handler): + request = [ + RequestType.REGISTER_INSTANCE.value, + 'test_instance', + ['tcp://localhost:10000'], + [['test_in', 'F_INIT']], + libmuscle.__version__ + "dev"] + encoded_request = msgpack.packb(request, use_bin_type=True) + + result = mmp_request_handler.handle_request(encoded_request) + decoded_result = msgpack.unpackb(result, raw=False) + + assert decoded_result[0] == ResponseType.ERROR.value + assert 'version' in decoded_result[1] + + def test_get_checkpoint_info(mmp_configuration, mmp_request_handler): resume_path = Path('/path/to/resume.pack') mmp_configuration.resume = {Reference('test_instance'): resume_path} @@ -145,7 +178,8 @@ def test_double_register_instance(mmp_request_handler): RequestType.REGISTER_INSTANCE.value, 'test_instance', ['tcp://localhost:10000'], - [['test_in', 'F_INIT']]] + [['test_in', 'F_INIT']], + libmuscle.__version__] encoded_request = msgpack.packb(request, use_bin_type=True) result = mmp_request_handler.handle_request(encoded_request) diff --git a/libmuscle/python/libmuscle/mmp_client.py b/libmuscle/python/libmuscle/mmp_client.py index eed4d99a..1deded1e 100644 --- a/libmuscle/python/libmuscle/mmp_client.py +++ b/libmuscle/python/libmuscle/mmp_client.py @@ -9,6 +9,7 @@ Conduit, Operator, Port, Reference, Settings, Checkpoints, CheckpointRule, CheckpointRangeRule, CheckpointAtRule) +import libmuscle from libmuscle.mcp.protocol import RequestType, ResponseType from libmuscle.mcp.tcp_transport_client import TcpTransportClient from libmuscle.profiling import ProfileEvent @@ -192,7 +193,8 @@ def register_instance(self, name: Reference, locations: List[str], request = [ RequestType.REGISTER_INSTANCE.value, str(name), locations, - [encode_port(p) for p in ports]] + [encode_port(p) for p in ports], + libmuscle.__version__] response = self._call_manager(request) if response[0] == ResponseType.ERROR.value: raise RuntimeError( diff --git a/libmuscle/python/libmuscle/test/test_mmp_client.py b/libmuscle/python/libmuscle/test/test_mmp_client.py index d5051962..51874e5d 100644 --- a/libmuscle/python/libmuscle/test/test_mmp_client.py +++ b/libmuscle/python/libmuscle/test/test_mmp_client.py @@ -4,6 +4,7 @@ import pytest from ymmsl import Conduit, Operator, Port, Reference +import libmuscle from libmuscle.logging import LogLevel, LogMessage, Timestamp from libmuscle.mcp.protocol import RequestType, ResponseType from libmuscle.mmp_client import MMPClient @@ -85,7 +86,8 @@ def test_register_instance(mocked_mmp_client) -> None: sent_msg = msgpack.unpackb(stub.call.call_args[0][0], raw=False) assert sent_msg == [ RequestType.REGISTER_INSTANCE.value, 'kernel[13]', - ['direct:test', 'tcp:test'], [['out', 'O_I'], ['in', 'S']]] + ['direct:test', 'tcp:test'], [['out', 'O_I'], ['in', 'S']], + libmuscle.__version__] def test_request_peers(mocked_mmp_client) -> None: From f0553c38a3319adb1eef0dcd3ad29c0bb9f2a92d Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Fri, 13 Jan 2023 10:19:42 +0100 Subject: [PATCH 137/183] Improve Message construction syntax Fixes #146, #125 - Python: add default (None) values for next_timestamp and data in Message.__init__ - C++: add Message(timestamp) constructor - Fortran: add LIBMUSCLE_Message_create(timestamp) Update Python examples to use `Message(t, data=...)` instead of `Message(t, None, ...)` --- docs/source/examples/python/diffusion.py | 2 +- .../examples/python/interact_coupling.py | 4 ++-- docs/source/examples/python/reaction.py | 2 +- .../examples/python/reaction_diffusion.py | 2 +- .../examples/python/reaction_diffusion_qmc.py | 6 ++--- docs/source/fortran_api.rst | 7 ++++++ docs/source/tutorial.rst | 9 ++++---- docs/source/uncertainty_quantification.rst | 4 ++-- integration_test/test_all.py | 2 +- integration_test/test_duplication_mapper.py | 2 +- integration_test/test_multicast.py | 2 +- integration_test/test_parameter_overlays.py | 5 ++-- .../test_snapshot_complex_coupling.py | 16 ++++++------- integration_test/test_snapshot_dispatch.py | 12 +++++----- integration_test/test_snapshot_interact.py | 4 ++-- integration_test/test_snapshot_macro_micro.py | 16 ++++++------- .../cpp/build/libmuscle/libmuscle.version | 1 + .../cpp/build/libmuscle/libmuscle_mpi.version | 1 + .../bindings/libmuscle_fortran_c.cpp | 5 ++++ .../bindings/libmuscle_mpi_fortran_c.cpp | 5 ++++ libmuscle/cpp/src/libmuscle/message.cpp | 8 +++++++ libmuscle/cpp/src/libmuscle/message.hpp | 6 +++++ .../tests/mocks/mock_communicator.cpp | 4 ++-- libmuscle/fortran/src/libmuscle/libmuscle.f90 | 23 +++++++++++++++++++ .../fortran/src/libmuscle/libmuscle_mpi.f90 | 23 +++++++++++++++++++ libmuscle/python/libmuscle/communicator.py | 4 ++-- .../python/libmuscle/test/test_snapshot.py | 2 +- .../libmuscle/test/test_snapshot_manager.py | 4 ++-- scripts/make_libmuscle_api.py | 3 ++- 29 files changed, 131 insertions(+), 53 deletions(-) diff --git a/docs/source/examples/python/diffusion.py b/docs/source/examples/python/diffusion.py index e4ad4726..994a5e1a 100644 --- a/docs/source/examples/python/diffusion.py +++ b/docs/source/examples/python/diffusion.py @@ -75,7 +75,7 @@ def diffusion() -> None: t_cur += dt # O_F - final_state_msg = Message(t_cur, None, Grid(U, ['x'])) + final_state_msg = Message(t_cur, data=Grid(U, ['x'])) instance.send('final_state_out', final_state_msg) if 'DONTPLOT' not in os.environ and 'SLURM_NODENAME' not in os.environ: diff --git a/docs/source/examples/python/interact_coupling.py b/docs/source/examples/python/interact_coupling.py index 3df5e11e..8014dad4 100644 --- a/docs/source/examples/python/interact_coupling.py +++ b/docs/source/examples/python/interact_coupling.py @@ -305,11 +305,11 @@ def checkpointing_temporal_coupler() -> None: t_cur = min(a.rcvd, b.rcvd) if instance.should_save_snapshot(t_cur): instance.save_snapshot(Message( - t_cur, None, {'a': a.get_state(), 'b': b.get_state()})) + t_cur, data={'a': a.get_state(), 'b': b.get_state()})) t_cur = min(a.rcvd, b.rcvd) if instance.should_save_final_snapshot(): - instance.save_final_snapshot(Message(t_cur, None, None)) + instance.save_final_snapshot(Message(t_cur)) if __name__ == '__main__': diff --git a/docs/source/examples/python/reaction.py b/docs/source/examples/python/reaction.py index aad03ba0..67e3d92e 100644 --- a/docs/source/examples/python/reaction.py +++ b/docs/source/examples/python/reaction.py @@ -30,7 +30,7 @@ def reaction() -> None: t_cur += dt # O_F - instance.send('final_state', Message(t_cur, None, Grid(U, ['x']))) + instance.send('final_state', Message(t_cur, data=Grid(U, ['x']))) if __name__ == '__main__': diff --git a/docs/source/examples/python/reaction_diffusion.py b/docs/source/examples/python/reaction_diffusion.py index 06001106..75958d5f 100644 --- a/docs/source/examples/python/reaction_diffusion.py +++ b/docs/source/examples/python/reaction_diffusion.py @@ -34,7 +34,7 @@ def reaction() -> None: t_cur += dt # O_F - instance.send('final_state', Message(t_cur, None, Grid(U, ['x']))) + instance.send('final_state', Message(t_cur, data=Grid(U, ['x']))) def laplacian(Z: np.array, dx: float) -> np.array: diff --git a/docs/source/examples/python/reaction_diffusion_qmc.py b/docs/source/examples/python/reaction_diffusion_qmc.py index 1f203de1..f96de4b1 100644 --- a/docs/source/examples/python/reaction_diffusion_qmc.py +++ b/docs/source/examples/python/reaction_diffusion_qmc.py @@ -35,7 +35,7 @@ def reaction() -> None: t_cur += dt # O_F - instance.send('final_state', Message(t_cur, None, Grid(U, ['x']))) + instance.send('final_state', Message(t_cur, data=Grid(U, ['x']))) def laplacian(Z: np.array, dx: float) -> np.array: @@ -105,7 +105,7 @@ def diffusion() -> None: t_cur += dt # O_F - instance.send('final_state_out', Message(t_cur, None, Grid(U, ['x']))) + instance.send('final_state_out', Message(t_cur, data=Grid(U, ['x']))) def load_balancer() -> None: @@ -200,7 +200,7 @@ def qmc_driver() -> None: uq_parameters = Settings({ 'd': ds[sample], 'k': ks[sample]}) - msg = Message(0.0, None, uq_parameters) + msg = Message(0.0, data=uq_parameters) instance.send('parameters_out', msg, sample) # S diff --git a/docs/source/fortran_api.rst b/docs/source/fortran_api.rst index 0d981576..895a49bd 100644 --- a/docs/source/fortran_api.rst +++ b/docs/source/fortran_api.rst @@ -1257,6 +1257,13 @@ LIBMUSCLE_Message will be overlaid onto the receiving model's settings; this is normally only used by special simulation components. +.. f:function:: LIBMUSCLE_Message_create(timestamp) + + Create a new Message object. + + :p LIBMUSCLE_real8 timestamp: The simulated time to which the data in this + message applies. + .. f:function:: LIBMUSCLE_Message_create(timestamp, data) Create a new Message object. diff --git a/docs/source/tutorial.rst b/docs/source/tutorial.rst index ab260230..3e10895b 100644 --- a/docs/source/tutorial.rst +++ b/docs/source/tutorial.rst @@ -228,7 +228,7 @@ Sending the final result .. code-block:: python # O_F - instance.send('final_state', Message(t_cur, None, Grid(U, ['x']))) + instance.send('final_state', Message(t_cur, data=Grid(U, ['x']))) After the update loop is done, the model has arrived at its final state. We @@ -240,9 +240,7 @@ Execution Loop, so that is where we declared this port to live in our To send a message, we specify the port on which to send (which must match the declaration by name and operator), and a Message object containing the current -simulation time and the current state, converted to a Grid. The optional second -parameter is a second timestamp, which will be discussed below, and is set to -``None`` here. +simulation time and the current state, converted to a Grid. MUSCLE3 uses `MessagePack `_ to encode messages between models. MessagePack is a binary encoding format which can be thought of as a @@ -334,7 +332,8 @@ So, to make your submodel more generically usable, it's good to set the second timestamp. But perhaps you're trying to connect an existing codebase that uses varying timestep sizes, and it's not easy to get it to tell you how big the next timestep will be. In that case, if you're not doing time scale overlap, -just put ``None`` there and move on to the next problem, it'll work just fine. +just create your message via ``Message(timestamp, data=...)`` or put ``None`` +as ``next_timestamp`` and move on to the next problem, it'll work just fine. Receiving messages with a default --------------------------------- diff --git a/docs/source/uncertainty_quantification.rst b/docs/source/uncertainty_quantification.rst index 1a25a98b..36dff972 100644 --- a/docs/source/uncertainty_quantification.rst +++ b/docs/source/uncertainty_quantification.rst @@ -174,7 +174,7 @@ this case, the port will be resizable and it will work as intended. uq_parameters = Settings({ 'd': ds[sample], 'k': ks[sample]}) - msg = Message(0.0, None, uq_parameters) + msg = Message(0.0, data=uq_parameters) instance.send('parameters_out', msg, sample) Since we only run our O_I and S once, we do not have a state update loop that @@ -189,7 +189,7 @@ in the central configuration. Next, we create a :class:`libmuscle.Message` object to send. Since our models will start at time 0, we'll set that as the timestamp, and since we're only -running them once each, the next timestamp is ``None``. For the data, we send +running them once each, we omit the next timestamp. For the data, we send the ``Settings`` object. (MUSCLE3 contains special support for sending ``Settings`` objects, since being objects they're not normally MessagePack-serialisable.) diff --git a/integration_test/test_all.py b/integration_test/test_all.py index 8764f3a4..a9c578e0 100644 --- a/integration_test/test_all.py +++ b/integration_test/test_all.py @@ -56,7 +56,7 @@ def micro(): 'int': 42, 'float': 3.1416, 'grid': Grid(np.array([[12.0, 34.0, 56.0], [1.0, 2.0, 3.0]]))} - instance.send('out', Message(0.1, None, result)) + instance.send('out', Message(0.1, data=result)) def test_all(log_file_in_tmpdir): diff --git a/integration_test/test_duplication_mapper.py b/integration_test/test_duplication_mapper.py index 9717ad36..7f65d6ad 100644 --- a/integration_test/test_duplication_mapper.py +++ b/integration_test/test_duplication_mapper.py @@ -14,7 +14,7 @@ def duplication_mapper(): # o_f out_ports = instance.list_ports()[Operator.O_F] - message = Message(0.0, None, 'testing') + message = Message(0.0, data='testing') for out_port in out_ports: instance.send(out_port, message) diff --git a/integration_test/test_multicast.py b/integration_test/test_multicast.py index 8dedee17..587f722e 100644 --- a/integration_test/test_multicast.py +++ b/integration_test/test_multicast.py @@ -10,7 +10,7 @@ def multicaster(): while instance.reuse_instance(): # o_f - message = Message(0.0, None, 'testing') + message = Message(0.0, data='testing') instance.send('out', message) diff --git a/integration_test/test_parameter_overlays.py b/integration_test/test_parameter_overlays.py index cf091193..1e342e5d 100644 --- a/integration_test/test_parameter_overlays.py +++ b/integration_test/test_parameter_overlays.py @@ -22,8 +22,7 @@ def qmc(): length = instance.get_port_length('settings_out') assert length == 10 for slot in range(length): - instance.send('settings_out', - Message(0.0, None, settings0), slot) + instance.send('settings_out', Message(0.0, data=settings0), slot) def macro(): @@ -87,7 +86,7 @@ def micro(): # instance.receive_with_settings('in') # o_f - instance.send('out', Message(0.1, None, 'testing back')) + instance.send('out', Message(0.1, data='testing back')) def test_settings_overlays(log_file_in_tmpdir): diff --git a/integration_test/test_snapshot_complex_coupling.py b/integration_test/test_snapshot_complex_coupling.py index dad2ee34..ba612075 100644 --- a/integration_test/test_snapshot_complex_coupling.py +++ b/integration_test/test_snapshot_complex_coupling.py @@ -23,7 +23,7 @@ def cache_component(max_channels=2): cache_t = float('-inf') cache_data = [] max_cache_age = None - nil_msg = Message(0.0, None, None) + nil_msg = Message(0.0) while instance.reuse_instance(): if instance.resuming(): @@ -41,17 +41,17 @@ def cache_component(max_channels=2): if cur_t - cache_t >= max_cache_age: # Cached value is no longer valid, run submodel for updated data for msg, port in zip(msgs, ports[Operator.O_I]): - instance.send(port, Message(cur_t, None, msg.data)) + instance.send(port, Message(cur_t, data=msg.data)) cache_data = [instance.receive(port, default=nil_msg).data for port in ports[Operator.S]] cache_t = cur_t max_cache_age = random.uniform(*cache_valid_range) for data, port in zip(cache_data, ports[Operator.O_F]): - instance.send(port, Message(cur_t, None, data)) + instance.send(port, Message(cur_t, data=data)) if instance.should_save_final_snapshot(): - instance.save_final_snapshot(Message(cur_t, None, [])) + instance.save_final_snapshot(Message(cur_t, data=[])) def echo_component(max_channels=2): @@ -87,7 +87,7 @@ def main_component(): i = 0 while time.monotonic() < monotonic_end: - instance.send('state_out', Message(t_cur, None, i)) + instance.send('state_out', Message(t_cur, data=i)) for port in ('Ai', 'Bi', 'Ci', 'Di'): instance.receive(port) @@ -97,12 +97,12 @@ def main_component(): if instance.should_save_snapshot(t_cur): instance.save_snapshot(Message( - t_cur, None, [i, monotonic_end - time.monotonic()])) + t_cur, data=[i, monotonic_end - time.monotonic()])) - instance.send('o_f', Message(t_cur, None, i)) + instance.send('o_f', Message(t_cur, data=i)) if instance.should_save_final_snapshot(): - instance.save_final_snapshot(Message(t_cur, None, [i, 0])) + instance.save_final_snapshot(Message(t_cur, data=[i, 0])) @pytest.fixture diff --git a/integration_test/test_snapshot_dispatch.py b/integration_test/test_snapshot_dispatch.py index 7102a43c..dc1b1aec 100644 --- a/integration_test/test_snapshot_dispatch.py +++ b/integration_test/test_snapshot_dispatch.py @@ -25,7 +25,7 @@ def component(): i, t_stop = msg.data if instance.should_init(): - msg = instance.receive('f_i', default=Message(0, None, 0)) + msg = instance.receive('f_i', default=Message(0, data=0)) t_cur = msg.timestamp i = msg.data t_stop = t_cur + t_max @@ -35,12 +35,12 @@ def component(): t_cur += dt if instance.should_save_snapshot(t_cur): - instance.save_snapshot(Message(t_cur, None, [i, t_stop])) + instance.save_snapshot(Message(t_cur, data=[i, t_stop])) - instance.send('o_f', Message(t_cur, None, i)) + instance.send('o_f', Message(t_cur, data=i)) if instance.should_save_final_snapshot(): - instance.save_final_snapshot(Message(t_cur, None, [i, t_stop])) + instance.save_final_snapshot(Message(t_cur, data=[i, t_stop])) def stateless_component(): @@ -53,7 +53,7 @@ def stateless_component(): dt = instance.get_setting('dt', 'float') t_max = instance.get_setting('t_max', 'float') - msg = instance.receive('f_i', default=Message(0, None, 0)) + msg = instance.receive('f_i', default=Message(0, data=0)) t_cur = msg.timestamp i = msg.data t_stop = t_cur + t_max @@ -62,7 +62,7 @@ def stateless_component(): # faux time-integration for testing snapshots t_cur += dt - instance.send('o_f', Message(t_cur, None, i)) + instance.send('o_f', Message(t_cur, data=i)) @pytest.fixture diff --git a/integration_test/test_snapshot_interact.py b/integration_test/test_snapshot_interact.py index 5492f9e2..4cb32b9a 100644 --- a/integration_test/test_snapshot_interact.py +++ b/integration_test/test_snapshot_interact.py @@ -58,10 +58,10 @@ def component(): i += 1 if instance.should_save_snapshot(t_cur): - instance.save_snapshot(Message(t_cur, None, [i, t_stop])) + instance.save_snapshot(Message(t_cur, data=[i, t_stop])) if instance.should_save_final_snapshot(): - instance.save_final_snapshot(Message(t_cur, None, [i, t_stop])) + instance.save_final_snapshot(Message(t_cur, data=[i, t_stop])) def test_snapshot_interact_lockstep(tmp_path): diff --git a/integration_test/test_snapshot_macro_micro.py b/integration_test/test_snapshot_macro_micro.py index 885ac704..ee2f0011 100644 --- a/integration_test/test_snapshot_macro_micro.py +++ b/integration_test/test_snapshot_macro_micro.py @@ -43,10 +43,10 @@ def macro(): t_cur += dt if instance.should_save_snapshot(t_cur): - instance.save_snapshot(Message(t_cur, None, i)) + instance.save_snapshot(Message(t_cur, data=i)) if instance.should_save_final_snapshot(): - instance.save_final_snapshot(Message(t_cur, None, i)) + instance.save_final_snapshot(Message(t_cur, data=i)) def macro_vector(): @@ -84,10 +84,10 @@ def macro_vector(): t_cur += dt if instance.should_save_snapshot(t_cur): - instance.save_snapshot(Message(t_cur, None, i)) + instance.save_snapshot(Message(t_cur, data=i)) if instance.should_save_final_snapshot(): - instance.save_final_snapshot(Message(t_cur, None, i)) + instance.save_final_snapshot(Message(t_cur, data=i)) def micro(): @@ -115,12 +115,12 @@ def micro(): t_cur += dt if instance.should_save_snapshot(t_cur): - instance.save_snapshot(Message(t_cur, None, [i, t_stop])) + instance.save_snapshot(Message(t_cur, data=[i, t_stop])) - instance.send('o_f', Message(t_cur, None, i)) + instance.send('o_f', Message(t_cur, data=i)) if instance.should_save_final_snapshot(): - instance.save_final_snapshot(Message(t_cur, None, [i, t_stop])) + instance.save_final_snapshot(Message(t_cur, data=[i, t_stop])) def stateless_micro(): @@ -142,7 +142,7 @@ def stateless_micro(): # faux time-integration for testing snapshots t_cur += dt - instance.send('o_f', Message(t_cur, None, i)) + instance.send('o_f', Message(t_cur, data=i)) def data_transformer(): diff --git a/libmuscle/cpp/build/libmuscle/libmuscle.version b/libmuscle/cpp/build/libmuscle/libmuscle.version index 4dc0e9b0..6a5400c4 100644 --- a/libmuscle/cpp/build/libmuscle/libmuscle.version +++ b/libmuscle/cpp/build/libmuscle/libmuscle.version @@ -303,6 +303,7 @@ LIBMUSCLE_PortsDescription_add_; LIBMUSCLE_PortsDescription_num_ports_; LIBMUSCLE_PortsDescription_get_; + LIBMUSCLE_Message_create_t_; LIBMUSCLE_Message_create_td_; LIBMUSCLE_Message_create_tnd_; LIBMUSCLE_Message_create_tds_; diff --git a/libmuscle/cpp/build/libmuscle/libmuscle_mpi.version b/libmuscle/cpp/build/libmuscle/libmuscle_mpi.version index dac26ff1..a6cb4915 100644 --- a/libmuscle/cpp/build/libmuscle/libmuscle_mpi.version +++ b/libmuscle/cpp/build/libmuscle/libmuscle_mpi.version @@ -303,6 +303,7 @@ LIBMUSCLE_PortsDescription_add_; LIBMUSCLE_PortsDescription_num_ports_; LIBMUSCLE_PortsDescription_get_; + LIBMUSCLE_Message_create_t_; LIBMUSCLE_Message_create_td_; LIBMUSCLE_Message_create_tnd_; LIBMUSCLE_Message_create_tds_; diff --git a/libmuscle/cpp/src/libmuscle/bindings/libmuscle_fortran_c.cpp b/libmuscle/cpp/src/libmuscle/bindings/libmuscle_fortran_c.cpp index 418a9c89..d20562d5 100644 --- a/libmuscle/cpp/src/libmuscle/bindings/libmuscle_fortran_c.cpp +++ b/libmuscle/cpp/src/libmuscle/bindings/libmuscle_fortran_c.cpp @@ -3549,6 +3549,11 @@ void LIBMUSCLE_PortsDescription_get_(std::intptr_t self, int op, std::size_t i, } } +std::intptr_t LIBMUSCLE_Message_create_t_(double timestamp) { + Message * result = new Message(timestamp); + return reinterpret_cast(result); +} + std::intptr_t LIBMUSCLE_Message_create_td_(double timestamp, std::intptr_t data) { Data * data_p = reinterpret_cast(data); Message * result = new Message(timestamp, *data_p); diff --git a/libmuscle/cpp/src/libmuscle/bindings/libmuscle_mpi_fortran_c.cpp b/libmuscle/cpp/src/libmuscle/bindings/libmuscle_mpi_fortran_c.cpp index 877bb2a6..fb1ba471 100644 --- a/libmuscle/cpp/src/libmuscle/bindings/libmuscle_mpi_fortran_c.cpp +++ b/libmuscle/cpp/src/libmuscle/bindings/libmuscle_mpi_fortran_c.cpp @@ -3549,6 +3549,11 @@ void LIBMUSCLE_PortsDescription_get_(std::intptr_t self, int op, std::size_t i, } } +std::intptr_t LIBMUSCLE_Message_create_t_(double timestamp) { + Message * result = new Message(timestamp); + return reinterpret_cast(result); +} + std::intptr_t LIBMUSCLE_Message_create_td_(double timestamp, std::intptr_t data) { Data * data_p = reinterpret_cast(data); Message * result = new Message(timestamp, *data_p); diff --git a/libmuscle/cpp/src/libmuscle/message.cpp b/libmuscle/cpp/src/libmuscle/message.cpp index 0809efd3..e5a648f9 100644 --- a/libmuscle/cpp/src/libmuscle/message.cpp +++ b/libmuscle/cpp/src/libmuscle/message.cpp @@ -6,6 +6,14 @@ using ymmsl::Settings; namespace libmuscle { namespace impl { +Message::Message( + double timestamp) + : timestamp_(timestamp) + , next_timestamp_() + , data_() + , settings_() +{} + Message::Message( double timestamp, DataConstRef const & data) diff --git a/libmuscle/cpp/src/libmuscle/message.hpp b/libmuscle/cpp/src/libmuscle/message.hpp index d50b1ad8..ab2ec186 100644 --- a/libmuscle/cpp/src/libmuscle/message.hpp +++ b/libmuscle/cpp/src/libmuscle/message.hpp @@ -17,6 +17,12 @@ namespace libmuscle { namespace impl { // out on the wire. See libmuscle::mcp::Message for that. class Message { public: + /** Create an empty Message. + * + * @param timestamp Simulation time for which this data is valid. + */ + Message(double timestamp); + /** Create a Message. * * @param timestamp Simulation time for which this data is valid. diff --git a/libmuscle/cpp/src/libmuscle/tests/mocks/mock_communicator.cpp b/libmuscle/cpp/src/libmuscle/tests/mocks/mock_communicator.cpp index 3a923511..0f01a3a8 100644 --- a/libmuscle/cpp/src/libmuscle/tests/mocks/mock_communicator.cpp +++ b/libmuscle/cpp/src/libmuscle/tests/mocks/mock_communicator.cpp @@ -107,7 +107,7 @@ void MockCommunicator::reset() { next_received_message.clear(); list_ports_return_value.clear(); last_sent_port = ""; - last_sent_message = Message(0.0, Data()); + last_sent_message = Message(0.0); last_sent_slot = {}; } @@ -126,7 +126,7 @@ PortsDescription MockCommunicator::list_ports_return_value; std::string MockCommunicator::last_sent_port; -Message MockCommunicator::last_sent_message(0.0, Data()); +Message MockCommunicator::last_sent_message(0.0); Optional MockCommunicator::last_sent_slot; diff --git a/libmuscle/fortran/src/libmuscle/libmuscle.f90 b/libmuscle/fortran/src/libmuscle/libmuscle.f90 index edbf529d..7c19c494 100644 --- a/libmuscle/fortran/src/libmuscle/libmuscle.f90 +++ b/libmuscle/fortran/src/libmuscle/libmuscle.f90 @@ -402,6 +402,7 @@ module libmuscle end type LIBMUSCLE_Message public :: LIBMUSCLE_Message + public :: LIBMUSCLE_Message_create_t public :: LIBMUSCLE_Message_create_td public :: LIBMUSCLE_Message_create_tnd public :: LIBMUSCLE_Message_create_tds @@ -2784,6 +2785,13 @@ subroutine LIBMUSCLE_PortsDescription_get_( & integer (c_size_t), intent(out) :: err_msg_len end subroutine LIBMUSCLE_PortsDescription_get_ + integer (c_intptr_t) function LIBMUSCLE_Message_create_t_(timestamp) & + bind(C, name="LIBMUSCLE_Message_create_t_") + + use iso_c_binding + real (c_double), value, intent(in) :: timestamp + end function LIBMUSCLE_Message_create_t_ + integer (c_intptr_t) function LIBMUSCLE_Message_create_td_( & timestamp, & data) & @@ -3798,6 +3806,7 @@ end function LIBMUSCLE_Instance_receive_with_settings_psd_ interface LIBMUSCLE_Message_create module procedure & + LIBMUSCLE_Message_create_t, & LIBMUSCLE_Message_create_td, & LIBMUSCLE_Message_create_tnd, & LIBMUSCLE_Message_create_tds, & @@ -16172,6 +16181,20 @@ function LIBMUSCLE_PortsDescription_get( & end do end function LIBMUSCLE_PortsDescription_get + function LIBMUSCLE_Message_create_t( & + timestamp) + implicit none + real (LIBMUSCLE_real8), intent(in) :: timestamp + type(LIBMUSCLE_Message) :: LIBMUSCLE_Message_create_t + + integer (c_intptr_t) :: ret_val + + ret_val = LIBMUSCLE_Message_create_t_( & + timestamp) + + LIBMUSCLE_Message_create_t%ptr = ret_val + end function LIBMUSCLE_Message_create_t + function LIBMUSCLE_Message_create_td( & timestamp, & data) diff --git a/libmuscle/fortran/src/libmuscle/libmuscle_mpi.f90 b/libmuscle/fortran/src/libmuscle/libmuscle_mpi.f90 index 5f56e3ab..76a9940f 100644 --- a/libmuscle/fortran/src/libmuscle/libmuscle_mpi.f90 +++ b/libmuscle/fortran/src/libmuscle/libmuscle_mpi.f90 @@ -403,6 +403,7 @@ module libmuscle_mpi end type LIBMUSCLE_Message public :: LIBMUSCLE_Message + public :: LIBMUSCLE_Message_create_t public :: LIBMUSCLE_Message_create_td public :: LIBMUSCLE_Message_create_tnd public :: LIBMUSCLE_Message_create_tds @@ -2789,6 +2790,13 @@ subroutine LIBMUSCLE_PortsDescription_get_( & integer (c_size_t), intent(out) :: err_msg_len end subroutine LIBMUSCLE_PortsDescription_get_ + integer (c_intptr_t) function LIBMUSCLE_Message_create_t_(timestamp) & + bind(C, name="LIBMUSCLE_Message_create_t_") + + use iso_c_binding + real (c_double), value, intent(in) :: timestamp + end function LIBMUSCLE_Message_create_t_ + integer (c_intptr_t) function LIBMUSCLE_Message_create_td_( & timestamp, & data) & @@ -3812,6 +3820,7 @@ end function LIBMUSCLE_Instance_receive_with_settings_psd_ interface LIBMUSCLE_Message_create module procedure & + LIBMUSCLE_Message_create_t, & LIBMUSCLE_Message_create_td, & LIBMUSCLE_Message_create_tnd, & LIBMUSCLE_Message_create_tds, & @@ -16190,6 +16199,20 @@ function LIBMUSCLE_PortsDescription_get( & end do end function LIBMUSCLE_PortsDescription_get + function LIBMUSCLE_Message_create_t( & + timestamp) + implicit none + real (LIBMUSCLE_real8), intent(in) :: timestamp + type(LIBMUSCLE_Message) :: LIBMUSCLE_Message_create_t + + integer (c_intptr_t) :: ret_val + + ret_val = LIBMUSCLE_Message_create_t_( & + timestamp) + + LIBMUSCLE_Message_create_t%ptr = ret_val + end function LIBMUSCLE_Message_create_t + function LIBMUSCLE_Message_create_td( & timestamp, & data) diff --git a/libmuscle/python/libmuscle/communicator.py b/libmuscle/python/libmuscle/communicator.py index 69272f78..e8ea49d2 100644 --- a/libmuscle/python/libmuscle/communicator.py +++ b/libmuscle/python/libmuscle/communicator.py @@ -36,8 +36,8 @@ class Message: """ # Note: This is for communication with the user, it's not what # actually goes out on the wire, see libmuscle.mcp.Message for that. - def __init__(self, timestamp: float, next_timestamp: Optional[float], - data: MessageObject, + def __init__(self, timestamp: float, next_timestamp: Optional[float] = None, + data: MessageObject = None, settings: Optional[Settings] = None ) -> None: """Create a Message. diff --git a/libmuscle/python/libmuscle/test/test_snapshot.py b/libmuscle/python/libmuscle/test/test_snapshot.py index f459a001..fd84d540 100644 --- a/libmuscle/python/libmuscle/test/test_snapshot.py +++ b/libmuscle/python/libmuscle/test/test_snapshot.py @@ -11,7 +11,7 @@ def snapshot() -> Snapshot: wallclock_time = 15.3 port_message_counts = {'in': [1], 'out': [4], 'muscle_settings_in': [0]} is_final = True - message = Message(1.2, None, 'test_data') + message = Message(1.2, data='test_data') snapshot = MsgPackSnapshot( triggers, wallclock_time, port_message_counts, is_final, message) assert snapshot.triggers == triggers diff --git a/libmuscle/python/libmuscle/test/test_snapshot_manager.py b/libmuscle/python/libmuscle/test/test_snapshot_manager.py index 31423bb0..c53d5f8e 100644 --- a/libmuscle/python/libmuscle/test/test_snapshot_manager.py +++ b/libmuscle/python/libmuscle/test/test_snapshot_manager.py @@ -33,7 +33,7 @@ def test_save_load_snapshot(tmp_path: Path) -> None: assert not snapshot_manager.resuming_from_final() snapshot_manager.save_snapshot( - Message(0.2, None, 'test data'), False, ['test'], 13.0) + Message(0.2, data='test data'), False, ['test'], 13.0) communicator.get_message_counts.assert_called_with() manager.submit_snapshot_metadata.assert_called() @@ -63,7 +63,7 @@ def test_save_load_snapshot(tmp_path: Path) -> None: assert msg.data == 'test data' snapshot_manager2.save_snapshot( - Message(0.6, None, 'test data2'), True, ['test'], 42.2, 1.2) + Message(0.6, data='test data2'), True, ['test'], 42.2, 1.2) instance, metadata = manager.submit_snapshot_metadata.call_args[0] assert instance == instance_id diff --git a/scripts/make_libmuscle_api.py b/scripts/make_libmuscle_api.py index 0b083a2e..a096b262 100755 --- a/scripts/make_libmuscle_api.py +++ b/scripts/make_libmuscle_api.py @@ -755,6 +755,7 @@ def __copy__(self) -> 'Elements': message_desc = Class('Message', None, [ + Constructor([Double('timestamp')], 'create_t'), Constructor([Double('timestamp'), Obj('Data', 'data')], 'create_td'), Constructor( [Double('timestamp'), Double('next_timestamp'), Obj('Data', 'data')], @@ -767,7 +768,7 @@ def __copy__(self) -> 'Elements': Obj('Settings', 'settings')], 'create_tnds'), OverloadSet('create', [ - 'create_td', 'create_tnd', 'create_tds', 'create_tnds']), + 'create_t', 'create_td', 'create_tnd', 'create_tds', 'create_tnds']), Destructor(), MemFun(Double(), 'timestamp'), MemFun(Void(), 'set_timestamp', [Double('timestamp')]), From 6135b9d42a52e278a42834ae34bdfee116d1ed11 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Fri, 13 Jan 2023 10:40:11 +0100 Subject: [PATCH 138/183] Replace yatiml references by MUSCLE3 references Fixes #138 --- CONTRIBUTING.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index b14102e9..2014113c 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -15,14 +15,14 @@ The sections below outline the steps in each case. You have a question ******************* -#. use the search functionality `here `_ to see if someone already filed the same issue; +#. use the search functionality `here `_ to see if someone already filed the same issue; #. if your issue search did not yield any relevant results, make a new issue; #. apply the "Question" label; apply other labels when relevant. You think you may have found a bug ********************************** -#. use the search functionality `here `_ to see if someone already filed the same issue; +#. use the search functionality `here `_ to see if someone already filed the same issue; #. if your issue search did not yield any relevant results, make a new issue, making sure to provide enough information to the rest of the community to understand the cause and context of the problem. Depending on the issue, you may want to include: - the `SHA hashcode `_ of the commit that is causing your problem; - some identifying information (name and version number) for dependencies you're using; @@ -35,10 +35,10 @@ You want to make some kind of change to the code base #. (**important**) announce your plan to the rest of the community *before you start working*. This announcement should be in the form of a (new) issue; #. (**important**) wait until some kind of consensus is reached about your idea being a good idea; #. if needed, fork the repository to your own Github profile and create your own feature branch off of the latest master commit. While working on your feature branch, make sure to stay up to date with the master branch by pulling in changes, possibly from the 'upstream' repository (follow the instructions `here `_ and `here `_); -#. make sure the existing tests still work by running ``python setup.py test``; +#. make sure the existing tests still work by running ``make test`` and ``make test_examples``; #. add your own tests (if necessary); #. update or expand the documentation; -#. `push `_ your feature branch to (your fork of) the YAtiML repository on GitHub; +#. `push `_ your feature branch to (your fork of) the MUSCLE3 repository on GitHub; #. create the pull request, e.g. following the instructions `here `_. In case you feel like you've made a valuable contribution, but you don't know how to write or run tests for it, or how to generate the documentation: don't let this discourage you from making the pull request; we can help you! Just go ahead and submit the pull request, but keep in mind that you might be asked to append additional commits to your pull request. From 7fdebc2f9f8abdc940b1cbf425399214226682b3 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Fri, 13 Jan 2023 11:30:44 +0100 Subject: [PATCH 139/183] Mark Message(timestamp) constructor as explicit --- libmuscle/cpp/src/libmuscle/message.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libmuscle/cpp/src/libmuscle/message.hpp b/libmuscle/cpp/src/libmuscle/message.hpp index ab2ec186..c73ad01e 100644 --- a/libmuscle/cpp/src/libmuscle/message.hpp +++ b/libmuscle/cpp/src/libmuscle/message.hpp @@ -21,7 +21,7 @@ class Message { * * @param timestamp Simulation time for which this data is valid. */ - Message(double timestamp); + explicit Message(double timestamp); /** Create a Message. * From f5fc06873db07f7e56994848dc8e19fdb32dafab Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Mon, 5 Dec 2022 11:34:58 +0100 Subject: [PATCH 140/183] Add Python checkpointing examples --- docs/source/examples/Makefile | 2 + docs/source/examples/python/Makefile | 1 + .../python/checkpointing_diffusion.py | 119 ++++++++++++++++++ .../examples/python/checkpointing_reaction.py | 57 +++++++++ docs/source/examples/rd_checkpoints.ymmsl | 31 +++++ .../examples/rd_implementations.ymmsl.in | 10 ++ 6 files changed, 220 insertions(+) create mode 100644 docs/source/examples/python/checkpointing_diffusion.py create mode 100644 docs/source/examples/python/checkpointing_reaction.py create mode 100644 docs/source/examples/rd_checkpoints.ymmsl diff --git a/docs/source/examples/Makefile b/docs/source/examples/Makefile index 2c70fe8e..8507eb52 100644 --- a/docs/source/examples/Makefile +++ b/docs/source/examples/Makefile @@ -101,6 +101,7 @@ clean: $(MAKE) -C fortran clean $(MAKE) -C python clean rm -f rd_implementations.ymmsl + rm -rf run_*/ # Tests @@ -108,6 +109,7 @@ clean: .PHONY: test_python test_python: base . python/build/venv/bin/activate && DONTPLOT=1 muscle_manager --start-all rd_implementations.ymmsl rd_python.ymmsl rd_settings.ymmsl + . python/build/venv/bin/activate && DONTPLOT=1 muscle_manager --start-all rd_implementations.ymmsl rd_checkpoints.ymmsl rd_settings.ymmsl make -C python test .PHONY: test_cpp diff --git a/docs/source/examples/python/Makefile b/docs/source/examples/python/Makefile index f87b4616..48f27607 100644 --- a/docs/source/examples/python/Makefile +++ b/docs/source/examples/python/Makefile @@ -11,3 +11,4 @@ test: .PHONY: clean clean: $(MAKE) -C build clean + rm -f *.log diff --git a/docs/source/examples/python/checkpointing_diffusion.py b/docs/source/examples/python/checkpointing_diffusion.py new file mode 100644 index 00000000..067858ed --- /dev/null +++ b/docs/source/examples/python/checkpointing_diffusion.py @@ -0,0 +1,119 @@ +import logging +import os + +import numpy as np + +from libmuscle import Grid, Instance, Message +from ymmsl import Operator + + +def laplacian(Z: np.ndarray, dx: float) -> np.ndarray: + """Calculates the Laplacian of vector Z. + + Args: + Z: A vector representing a series of samples along a line. + dx: The spacing between the samples. + + Returns: + The second spatial derivative of Z. + """ + Zleft = Z[:-2] + Zright = Z[2:] + Zcenter = Z[1:-1] + return (Zleft + Zright - 2. * Zcenter) / dx**2 + + +def diffusion() -> None: + """A simple diffusion model on a 1d grid. + + The state of this model is a 1D grid of concentrations. It sends + out the state on each timestep on `state_out`, and can receive an + updated state on `state_in` at each state update. + """ + logger = logging.getLogger() + instance = Instance({ + Operator.O_I: ['state_out'], + Operator.S: ['state_in'], + Operator.O_F: ['final_state_out']}) + + while instance.reuse_instance(): + # F_INIT + t_max = instance.get_setting('t_max', 'float') + dt = instance.get_setting('dt', 'float') + x_max = instance.get_setting('x_max', 'float') + dx = instance.get_setting('dx', 'float') + d = instance.get_setting('d', 'float') + + if instance.resuming(): + msg = instance.load_snapshot() + U = msg.data[0].array.copy() + Us = msg.data[1].array.copy() + t_cur = msg.timestamp + + if instance.should_init(): + U = np.zeros(int(round(x_max / dx))) + 1e-20 + U[25] = 2.0 + U[50] = 2.0 + U[75] = 2.0 + Us = U + t_cur = 0.0 + + while t_cur + dt <= t_max: + # O_I + t_next = t_cur + dt + if t_next + dt > t_max: + t_next = None + cur_state_msg = Message(t_cur, t_next, Grid(U, ['x'])) + instance.send('state_out', cur_state_msg) + + # S + msg = instance.receive('state_in', default=cur_state_msg) + if msg.timestamp > t_cur + dt: + logger.warning('Received a message from the future!') + np.copyto(U, msg.data.array) + + dU = np.zeros_like(U) + dU[1:-1] = d * laplacian(U, dx) * dt + dU[0] = dU[1] + dU[-1] = dU[-2] + + U += dU + Us = np.vstack((Us, U)) + t_cur += dt + + if instance.should_save_snapshot(t_cur): + msg = Message(t_cur, None, [Grid(U), Grid(Us)]) + instance.save_snapshot(msg) + + # O_F + final_state_msg = Message(t_cur, None, Grid(U, ['x'])) + instance.send('final_state_out', final_state_msg) + + if 'DONTPLOT' not in os.environ and 'SLURM_NODENAME' not in os.environ: + from matplotlib import pyplot as plt + plt.figure() + plt.imshow( + np.log(Us + 1e-20), + origin='upper', + extent=[ + -0.5*dx, x_max - 0.5*dx, + (t_max - 0.5*dt) * 1000.0, -0.5*dt * 1000.0], + interpolation='none', + aspect='auto' + ) + cbar = plt.colorbar() + cbar.set_label('log(Concentration)', rotation=270, labelpad=20) + plt.xlabel('x') + plt.ylabel('t (ms)') + plt.title('Concentration over time') + plt.show() + + if instance.should_save_final_snapshot(): + msg = Message(t_cur, None, [Grid(U), Grid(Us)]) + instance.save_final_snapshot(msg) + + +if __name__ == '__main__': + logging.basicConfig() + logging.getLogger().setLevel(logging.INFO) + diffusion() diff --git a/docs/source/examples/python/checkpointing_reaction.py b/docs/source/examples/python/checkpointing_reaction.py new file mode 100644 index 00000000..07204e64 --- /dev/null +++ b/docs/source/examples/python/checkpointing_reaction.py @@ -0,0 +1,57 @@ +import logging + +from libmuscle import Grid, Instance, Message +from ymmsl import Operator + + +def reaction() -> None: + """A simple exponential reaction model on a 1D grid. + """ + instance = Instance({ + Operator.F_INIT: ['initial_state'], # list of float + Operator.O_F: ['final_state']}) # list of float + + while instance.reuse_instance(): + t_max = instance.get_setting('t_max', 'float') + dt = instance.get_setting('dt', 'float') + k = instance.get_setting('k', 'float') + + if instance.resuming(): + msg = instance.load_snapshot() + if msg.data is not None: + # A final snapshot does not have data in it, but that's fine: we + # will do the F_INIT step inside `should_init()` below. + U = msg.data[0].array.copy() + t_cur = msg.timestamp + t_stop = msg.data[1] + + # F_INIT + if instance.should_init(): + msg = instance.receive('initial_state') + U = msg.data.array.copy() + t_cur = msg.timestamp + t_stop = msg.timestamp + t_max + + while t_cur + dt < t_stop: + # O_I + + # S + U += k * U * dt + t_cur += dt + + if instance.should_save_snapshot(t_cur): + instance.save_snapshot(Message(t_cur, None, [ + Grid(U, ['x']), + t_stop])) + + # O_F + instance.send('final_state', Message(t_cur, None, Grid(U, ['x']))) + + if instance.should_save_final_snapshot(): + instance.save_final_snapshot(Message(t_cur, None, None)) + + +if __name__ == '__main__': + logging.basicConfig() + logging.getLogger().setLevel(logging.INFO) + reaction() diff --git a/docs/source/examples/rd_checkpoints.ymmsl b/docs/source/examples/rd_checkpoints.ymmsl new file mode 100644 index 00000000..82dfd638 --- /dev/null +++ b/docs/source/examples/rd_checkpoints.ymmsl @@ -0,0 +1,31 @@ +ymmsl_version: v0.1 + +model: + name: checkpointing_reaction_diffusion_python + + components: + macro: + implementation: checkpointing_diffusion_python + ports: + o_i: state_out + s: state_in + + micro: + implementation: checkpointing_reaction_python + ports: + f_init: initial_state + o_f: final_state + + conduits: + macro.state_out: micro.initial_state + micro.final_state: macro.state_in + +resources: + macro: + threads: 1 + micro: + threads: 1 + +checkpoints: + simulation_time: + - every: 2.0e-05 diff --git a/docs/source/examples/rd_implementations.ymmsl.in b/docs/source/examples/rd_implementations.ymmsl.in index 4f2b0c7b..92cc2b8d 100644 --- a/docs/source/examples/rd_implementations.ymmsl.in +++ b/docs/source/examples/rd_implementations.ymmsl.in @@ -62,3 +62,13 @@ implementations: env: +LD_LIBRARY_PATH: :MUSCLE3_HOME/lib executable: MUSCLE3_EXAMPLES/fortran/build/load_balancer + + checkpointing_reaction_python: + virtual_env: MUSCLE3_EXAMPLES/python/build/venv + executable: python + args: MUSCLE3_EXAMPLES/python/checkpointing_reaction.py + + checkpointing_diffusion_python: + virtual_env: MUSCLE3_EXAMPLES/python/build/venv + executable: python + args: MUSCLE3_EXAMPLES/python/checkpointing_diffusion.py From 241707ff4d215bec29027772ce6105a8acafc6c4 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Mon, 5 Dec 2022 14:22:34 +0100 Subject: [PATCH 141/183] Fix --run-dir ignored when not using --start-all --- muscle3/muscle_manager.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/muscle3/muscle_manager.py b/muscle3/muscle_manager.py index 02d30803..b0bed73a 100644 --- a/muscle3/muscle_manager.py +++ b/muscle3/muscle_manager.py @@ -1,4 +1,4 @@ -from datetime import datetime, timezone +from datetime import datetime from pathlib import Path import sys from typing import Optional, Sequence @@ -68,12 +68,15 @@ def manage_simulation( else: run_dir_path = Path(run_dir).resolve() + run_dir_obj = RunDir(run_dir_path) if start_all: - run_dir_obj = RunDir(run_dir_path) manager = Manager(configuration, run_dir_obj, log_level) manager.start_instances() else: - manager = Manager(configuration, None, log_level) + if run_dir is None: + manager = Manager(configuration, None, log_level) + else: + manager = Manager(configuration, run_dir_obj, log_level) print(manager.get_server_location()) success = manager.wait() From 352cffd2f74823e259cd01651671a8e78f964f25 Mon Sep 17 00:00:00 2001 From: Maarten Sebregts Date: Mon, 5 Dec 2022 16:43:43 +0100 Subject: [PATCH 142/183] First draft of checkpointing user documentation. --- docs/source/checkpointing.rst | 539 ++++++++++++++++++++++ docs/source/examples/rd_checkpoints.ymmsl | 2 + docs/source/index.rst | 1 + 3 files changed, 542 insertions(+) create mode 100644 docs/source/checkpointing.rst diff --git a/docs/source/checkpointing.rst b/docs/source/checkpointing.rst new file mode 100644 index 00000000..62f9952a --- /dev/null +++ b/docs/source/checkpointing.rst @@ -0,0 +1,539 @@ +Simulation checkpoints +====================== + +When you execute a long-running simulation, it can be very helpful to store the +state of a simulation at certain intervals. For example, your simulation running +on a HPC cluster may crash, just before it's finished, due to insufficient +memory available. Instead of restarting this simulation from scratch, you could +restart it -- with an increased memory allocation -- from a checkpoint, which +would save a lot of compute time! + +Checkpointing in distributed simulations is difficult. Fortunately, MUSCLE3 +comes with built-in checkpointing support. This page describes in detail how to +use the MUSCLE3 checkpointing API, how to specify checkpoints in the workflow +configuration and how to resume a workflow. + +In the :ref:`user tutorial`, you can read about the checkpointing concepts and +how to use the API when running and resuming MUSCLE3 simulations. This is +followed by a :ref:`developer tutorial`, which explains how to add checkpointing +capabilities to your MUSCLE3 component. Finally, the :ref:`checkpointing +deep-dive` describes in detail the (inner) working of checkpointing in MUSCLE3; +though this level of detail is not required for general usage of the API. + + +Glossary +-------- + +.. glossary:: + + Checkpoint + A checkpoint is a moment during the workflow where the user wants + to have the state of the whole workflow stored. + + Snapshot + A snapshot is the stored state of an instance in the workflow. + + Workflow snapshot + A workflow snapshot is a collection of :term:`snapshots` for + all instances in the workflow, which can be resumed from. This means + that the snapshots of every combination of :term:`peer instances` must + be :ref:`consistent `. + + Peer instances + Two instances that are connected by a Conduit. + + +User tutorial +------------- + + +Defining checkpoints +```````````````````` + +The first step for using checkpoints is to define checkpoints in your workflow. +The checkpoint definitions are for your whole workflow, and you can specify them +in yMMSL as in the following example: + +.. code-block:: yaml + :caption: Example checkpoint definition in yMMSL. + + checkpoints: + at_end: true + simulation_time: + - every: 10 + start: 0 + stop: 100 + - every: 20 + start: 100 + wallclock_time: + - every: 3600 + - at: + - 300 + - 600 + - 1800 + +Let's break this down: the first element in this example ``checkpoints`` +definition is ``at_end``. When this is set to ``true`` (as in the example), it +means that every instance in the workflow will create a snapshot just before the +workflow finishes. This set of snapshots can be used to resume a simulation near +the end and, for example, let it run for a longer time. Some caveats apply, +though, see :ref:`resuming from *at_end* snapshots` for full details. + +The other two items in the ``checkpoints`` definition are the time-based +:ref:`simulation time` and +:ref:`wallclock time`. You can use two types of +rules to set checkpoint moments for these: + +.. _at checkpoint rule: + +#. ``at`` rules define specific moments. The example rule above request a + checkpoint to be taken at 300, 600 and 1800 seconds after the start of the + simulation. You can define multiple times in one ``at`` rule, but you may + also add multiple ``at`` rules. The following definitions are all equivalent: + + .. tabs:: + + .. tab:: Standard + + .. code-block:: yaml + + checkpoints: + wallclock_time: + - at: + - 300 + - 600 + - 1800 + + .. tab:: Inline list + + .. code-block:: yaml + + checkpoints: + wallclock_time: + - at: [300, 600, 1800] + + .. tab:: Multiple ``at`` rules + + .. code-block:: yaml + + checkpoints: + wallclock_time: + - at: 300 + - at: 600 + - at: 1800 + +.. _every checkpoint rule: + +#. ``every`` rules define a recurring set of checkpoints. In the simplest form + you indicate the interval at which checkpoints should be taken -- every hour + in the ``wallclock_time`` example above. You may optionally indicate a + ``start`` or ``stop`` -- as in the ``simulation_time`` example above. + + .. tabs:: + + .. tab:: Simple + + .. code-block:: yaml + :caption: Without ``start`` and ``stop`` indicated, this rule creates a snapshot every hour of elapsed time. + + checkpoints: + wallclock_time: + every: 3600 + + .. tab:: Start and stop + + .. code-block:: yaml + :caption: This combination of rules define a checkpoint at ``t=0``, ``t=10``, ..., until ``t=100``. Afterwards it continues indefinitely every 20 time units (``t=120``, ``t=140``, ...). + + checkpoints: + simulation_time: + - every: 10 + start: 0 + stop: 100 + - every: 20 + start: 100 + + .. tab:: Overlapping ranges + + .. code-block:: yaml + :caption: Overlapping ranges work as well. This combination defines a checkpoint every unit of time (``t=0``, ``t=1``, ...), and additionally at ``t=0.25``, ``t=0.75``, ``t=1.25`` and ``t=1.75``. + + checkpoints: + simulation_time: + - every: 1 + - every: 0.25 + start: 0 + stop: 2 + + .. note:: + + When ``stop`` is specified, the stop time is included when ``stop == + start + n * every``, with ``n`` a positive whole number. However, this + might give surprising results due to the inaccuracies of floating point + computations. Compare for example: + + .. code-block:: yaml + :caption: This specifies a checkpoint at 0, 1, 2, ..., 6 and 7. + + checkpoints: + simulation_time: + - every: 1 + start: 0 + stop: 7 + + .. code-block:: yaml + :caption: However this only checkpoints at 0, 0.1, 0.2, ... 0.5 and 0.6! + + checkpoints: + simulation_time: + - every: 0.1 + start: 0 + stop: 0.7 + + Why the difference? Well - compare in python: + + .. code-block:: python + + >>> 7 * 1.0 + 7.0 + >>> 7 * 0.1 + 0.7000000000000001 + + Since ``0.7000000000000001`` is larger than ``0.7``, no checkpoint will + be generated for this time. + +.. seealso:: + + yMMSL documentation on :external+ymmsl:ref:`Checkpoints` + + yMMSL API reference: :external:py:class:`ymmsl.Checkpoints`, + :external:py:class:`ymmsl.CheckpointAtRule`, + :external:py:class:`ymmsl.CheckpointRangeRule` + + +Simulation time checkpoints +''''''''''''''''''''''''''' + +Checkpoints defined in the ``simulation_time`` section are taken based on the +time inside your simulation. It will only work correctly if all components in +the simulation have a shared concept of time, which only increases during the +simulation. This should be no problem for physics-based simulations, though it +does require that the instances make correct use of the :ref:`timestamp in +MUSCLE3 messages `. When this requirement is fulfilled, +checkpoints based on simulation time are the most reliable way to checkpoint +your workflow. + +MUSCLE3 does not interpret or convert the units that you configure in the +checkpoints. The units are the same as the components in the simulation use for +the timestamps in the messages. Typically this will be in SI seconds, but +components may deviate from this standard. MUSCLE3 assumes that all components +in the workflow use the same time units in the interfaces to libmuscle. + +.. note:: + + MUSCLE3 does not assume anything about the start time of a simulation. Your + simulation time may start at any value, even negative! Therefore, + :ref:`checkpoint ranges ` include 0 and negative + numbers when no ``start`` value is provided. + + Because MUSCLE3 does not know what internal time your simulation starts on, + an ``every`` rule without a ``start`` value will always trigger a checkpoint + at the first possible moment in the simulation. You should supply a + ``start`` value if you do not want this to happen. + + +Wallclock time checkpoints +'''''''''''''''''''''''''' + +Checkpoints defined in the ``wallclock_time`` section are taken based on the +elapsed wallclock time of your simulation (also known as *elapsed real time*). +Each component in the simulation will make a snapshot at the earliest possible +moment after a checkpoint is passed. + +The checkpoint times in the configuration are interpreted as seconds since the +initialization of ``muscle_manager``. + +.. warning:: + + Wallclock time checkpoint definitions are (currently) not a reliable way to + create :term:`workflow snapshots `. While each instance + in the simulation will create a snapshot when requested, there is no + guarantee that all snapshots are :ref:`consistent `. + + When a simulation has relatively simple coupling between components, i.e. + only one peer instance per :external:py:class:`~ymmsl.Operator`, + checkpointing based on wallclock time usually works fine. + + However for co-simulation (the *interact* coupling type) and more complex + coupling, it is likely that not all checkpoints lead to a consistent + :term:`workflow snapshot`. + + +Running a simulation with checkpoints +````````````````````````````````````` + +Starting a simulation with checkpoints is no different than starting one +without. You need to start the ``muscle_manager`` with the configuration yMMSL +file (or files), as well as the individual components (or let ``muscle_manager`` +start them for you with the ``--start-all`` flag). The sole difference is that +the yMMSL configuration must contain a :ref:`checkpoints section `. + +When ``muscle_manager`` is started with checkpoints configured, a couple of +things change. First, **all** of the component implementations **must** support +checkpointing: the simulation will stop with an error if this is not the case. +The simulation may also stop with an error if there is an issue in the +checkpointing implementation of any of the components. + +Second, all components are instructed to make snapshots according to the +configured checkpoints. ``muscle_manager`` keeps track of all created snapshots +during the simulation, looking for :term:`workflow snapshots `. When a workflow snapshot is detected, ``muscle_manager`` writes a +yMMSL file that can be used to :ref:`resume the simulation `. + +During the simulation, all of the created snapshots are stored on the file +system. See below table for the directories where MUSCLE3 stores the files. +Note: a run-directory is automatically created when using the ``--start-all`` +flag for ``muscle_manager``. You may also specify a custom run directory through +the ``--run-dir DIRECTORY`` option. When you do not provide a run directory, the +last column in below table indicates where snapshots are stored. + +.. list-table:: Directories where MUSCLE3 stores snapshot files. + :header-rows: 1 + + * - Snapshot type + - Run directory provided + - No run directory provided + * - Workflow + - ``run_dir/snapshots/`` + - Working directory of ``muscle_manager`` + * - Instance + - ``run_dir/instances//snapshots/``, + + with ```` the name of the instance. + - Working directory of the instance + +.. note:: + + When running a :ref:`distributed simulation ` on + multiple compute nodes, MUSCLE3 assumes that the run directory is accessible + to all nodes (i.e. on a shared or distributed file system). This is usually + the case on HPC clusters. + + +Example: running the reaction-diffusion model with checkpoints +'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''' + +The reaction-diffusion example model from the :ref:`Tutorial with Python` also +has a variant with checkpointing enabled. To run this yourself, navigate in a +command line prompt to the ``docs/source/examples`` folder in the MUSCLE3 git +repository. Then execute the following command: + +.. code-block:: bash + + $ mkdir run_rd_example + $ muscle_manager --start-all --run-dir run_rd_example rd_implementations.ymmsl rd_checkpoints.ymmsl rd_settings.ymmsl + +.. note:: + + You may get an error ``File 'rd_implementations.ymmsl' does not exist.`` To + fix this, you need to build the examples in the MUSCLE3 source; in the root + of the git repository, execute: + + .. code-block:: + + $ make test_examples + +Above command runs the ``muscle_manager`` and starts all components (the +reaction model and the diffusion model). The ``rd_checkpoints.ymmsl`` file +contains the checkpoint definitions used in this example: + +.. literalinclude:: examples/rd_checkpoints.ymmsl + :caption: ``docs/source/examples/rd_checkpoints.ymmsl, lines 31-33`` + :lines: 31-33 + :language: yaml + +MUSCLE3 will create the run directory ``run_rd_example`` for you. In it you'll +find the instance snapshots in ``instances/macro/snapshots`` and +``instances/micro/snapshots``. The workflow snapshots are stored in the +``snapshots`` folder in the run directory. + +Resuming a simulation +````````````````````` + +You can resume a simulation from a :term:`workflow snapshot` stored in a +previous run of the simulation. This works by appending a workflow snapshot +yMMSL file from a previous run to the regular yMMSL configuration. If you +started your original simulation with:: + + $ muscle_manager --run-dir ./run1 configuration.ymmsl + +You can resume it from a snapshot of this run like so:: + + $ muscle_manager --run-dir ./run2 configuration.ymmsl ./run1/snapshots/snapshot_20221202_112840.ymmsl + +Here we choose a different run directory, and resume from the snapshot file +``snapshot_20221202_112840.ymmsl`` that was produced by the first run. This file +contains the information required to resume the workflow: + +- It contains a ``description`` which allows you to inspect metadata of the + workflow snapshot. It indicates the trigger or triggers leading to this + snapshot, and some information of the state of each component in the + workflow. This data is for informational purposes only, and ignored by + ``muscle_manager``. +- It also contains the paths to the snapshots that each instance needs to + resume. Note that these snapshots must still exist on the same location. If + you move or delete them (or a parent directory), resuming your simulation + will fail with an error message:: + + Unable to load snapshot: is not a file. Please ensure this path exists and can be read. + + +Example: resuming the reaction-diffusion model +'''''''''''''''''''''''''''''''''''''''''''''' + +To resume the reaction-diffusion model from a snapshot created in the +:ref:`previous section `` and ``