From a97e05becb151accd8ac7e84d0df2de38fc202a0 Mon Sep 17 00:00:00 2001 From: Kent Lee Date: Tue, 12 Mar 2024 02:37:15 +0000 Subject: [PATCH] merged release 0.8 code --- .devcontainer/constraints.txt | 2 +- .devcontainer/requirements.txt | 1 + doc/.gitignore | 2 + doc/cbook/ai-in-the-loop.rst | 45 +- doc/cbook/basic_pandarallel_demo.rst | 38 +- .../bioinfo_alignment_pandarallel_demo.rst | 49 +- doc/cbook/dict_torch_dataset.rst | 83 + doc/cbook/distr-inf-telemetry.rst | 18 +- doc/cbook/dragon_dict.rst | 7 +- doc/cbook/dragon_joblib.rst | 718 +++++++ doc/cbook/torch-scipy-telemetry.rst | 5 +- doc/components/broadcast.rst | 10 +- doc/components/managed_memory/bitset.rst | 13 +- doc/components/managed_memory/heapmanager.rst | 36 +- doc/components/managed_memory/hexdump.rst | 8 +- .../managed_memory/managed_memory.rst | 3 +- .../scalable_locks.rst.needs_work | 13 +- doc/conf.py | 4 +- doc/infrastructure/architecture.rst | 5 +- .../images/overlay_network_fanout.puml | 38 + doc/infrastructure/infrastructure.rst | 4 +- doc/infrastructure/messages_api.rst | 26 +- doc/infrastructure/multi_node_deployment.rst | 16 +- doc/infrastructure/overlay_network.rst | 18 + doc/infrastructure/processes.rst | 5 +- doc/infrastructure/single_node_deployment.rst | 22 +- doc/pguide/owner.rst | 5 +- doc/pguide/stack.rst | 37 +- doc/ref/client/index.rst | 5 +- doc/ref/data/index.rst | 15 + doc/ref/inf/index.rst | 6 +- doc/ref/mpbridge/index.rst | 16 +- doc/ref/native/index.rst | 3 +- doc/ref/ref.rst | 2 + doc/ref/workflows/index.rst | 16 + doc/services/launcher.rst | 28 +- doc/services/local_services.rst | 15 +- doc/start/start.rst | 8 +- doc/uguide/glossary.rst | 3 +- doc/uguide/intro.rst | 5 +- doc/uguide/resource_model.rst | 5 +- doc/uguide/running_dragon.rst | 333 ++- examples/dragon_ai/README.md | 38 + examples/dragon_ai/dict_torch_dataset.py | 269 +++ .../jupyter/doc_ref/basic_pandarallel_demo.py | 101 + ...oinformatics_alignment_pandarallel_demo.py | 202 ++ ...cs_alignment_pandarallel_multinode_demo.py | 182 ++ .../joblib/bench_auto_batching.py | 101 + .../joblib/compressor_comparison.py | 90 + .../joblib/delayed_comparison.py | 41 + .../joblib/memory_basic_usage.py | 85 + .../joblib/nested_parallel_memory.py | 133 ++ .../multiprocessing/joblib/parallel_memmap.py | 76 + .../joblib/parallel_random_state.py | 66 + .../joblib/serialization_and_wrappers.py | 44 + examples/multiprocessing/p2p_lat.py | 10 +- examples/multiprocessing/unittests/common.py | 29 +- .../unittests/test_condition.py | 8 +- .../unittests/test_connection.py | 3 +- .../unittests/test_listener.py | 1 - .../multiprocessing/unittests/test_others.py | 24 +- .../multiprocessing/unittests/test_pool.py | 16 +- .../multiprocessing/unittests/test_process.py | 36 +- .../multiprocessing/unittests/test_queue.py | 39 +- .../unittests/test_shared_memory.py | 2 - examples/workflows/ai-in-the-loop/README.md | 27 +- examples/workflows/parsl/README.md | 12 +- src/dragon/__init__.py | 11 +- src/dragon/ai/__init__.py | 0 src/dragon/ai/torch/__init__.py | 6 + src/dragon/ai/torch/dataloader_monkeypatch.py | 43 + src/dragon/ai/torch/dictdataset.py | 58 + src/dragon/ai/torch/monkeypatching.py | 45 + src/dragon/cli/__init__.py | 2 +- src/dragon/dtypes_inc.pxd | 61 +- src/dragon/globalservices/policy_eval.py | 50 +- src/dragon/globalservices/process.py | 26 +- src/dragon/globalservices/process_int.py | 8 +- src/dragon/infrastructure/facts.py | 14 +- src/dragon/infrastructure/gpu_desc.py | 92 + src/dragon/infrastructure/messages.py | 178 +- 
src/dragon/infrastructure/node_desc.py | 70 +- src/dragon/infrastructure/parameters.py | 56 +- src/dragon/infrastructure/policy.py | 131 +- src/dragon/infrastructure/util.py | 128 +- src/dragon/launcher/backend.py | 172 +- src/dragon/launcher/dragon_single.py | 2 +- src/dragon/launcher/frontend.py | 126 +- src/dragon/launcher/launch_selector.py | 5 +- src/dragon/launcher/network_config.py | 4 +- src/dragon/launcher/wlm/base.py | 158 +- src/dragon/launcher/wlm/pbs_pals.py | 8 + src/dragon/launcher/wlm/slurm.py | 13 +- src/dragon/launcher/wlm/ssh.py | 7 +- src/dragon/localservices/local_svc.py | 26 +- src/dragon/localservices/server.py | 41 +- src/dragon/mpbridge/pool.py | 142 +- src/dragon/native/pool.py | 5 +- src/dragon/native/process_group.py | 69 +- src/dragon/native/queue.py | 7 +- src/dragon/pydragon_channels.pyx | 2 +- src/dragon/pydragon_fli.pyx | 541 +++++ src/dragon/pydragon_heap.pyx | 2 +- src/dragon/pydragon_heapmanager.pyx | 6 +- src/dragon/pydragon_lock.pyx | 6 +- src/dragon/pydragon_managed_memory.pyx | 4 +- src/dragon/pydragon_pmod.pyx | 2 +- src/dragon/pydragon_utils.pyx | 2 +- src/dragon/transport/tcp/__main__.py | 4 + src/dragon/transport/tcp/agent.py | 37 + src/dragon/transport/tcp/client.py | 14 + src/include/Makefile | 3 +- src/include/dragon/channels.h | 3 + src/include/dragon/fli.h | 698 +++++++ src/include/dragon/global_types.h | 20 + src/include/dragon/managed_memory.h | 11 + src/include/dragon/return_codes.h | 2 + src/include/dragon/utils.h | 8 +- src/lib/_channels.h | 2 - src/lib/_fli.h | 95 + src/lib/bcast.c | 17 +- src/lib/channels.c | 73 +- src/lib/channels_messages.c | 2 +- src/lib/err.h | 1 + src/lib/fli.c | 1831 +++++++++++++++++ src/lib/heap_manager.c | 4 +- src/lib/managed_memory.c | 220 +- src/lib/pals.c | 43 +- src/lib/shared_lock.c | 168 +- src/lib/shared_lock.h | 15 +- src/lib/utils.c | 59 +- src/lmod/dragon-dev.lua | 4 +- src/lmod/dragon.lua | 2 +- src/modulefiles/dragon | 2 +- src/modulefiles/dragon-dev | 6 +- src/pkg/CHANGELOG.md | 65 + src/pkg/INSTALL.md | 4 +- src/pkg/Makefile | 4 +- src/pkg/README.md | 18 +- src/pkg/RELEASE_NOTES.md | 27 +- src/setup.py | 26 +- src/tools/dragon-cleanup | 3 +- test/ai/torch/README.md | 20 + test/ai/torch/test_pytorch_patches.py | 90 + test/channels_subtests/.gitignore | 1 + test/channels_subtests/Makefile | 7 +- test/channels_subtests/test_basic_channels.py | 53 +- test/channels_subtests/test_fli.c | 708 +++++++ test/channels_subtests/test_fli.py | 258 +++ test/channels_subtests/test_peek_pop.c | 3 - test/infrastructure/test_gpu_desc.py | 40 + test/infrastructure/test_policy.py | 35 + test/launcher/backend_testing_mocks.py | 75 +- test/launcher/frontend_testing_mocks.py | 28 +- test/launcher/test_backend_bringup.py | 27 +- test/launcher/test_frontend_bringup.py | 75 +- test/launcher/test_signal_handling.py | 12 +- test/multi-node/test_array.py | 25 +- test/multi-node/test_dict.py | 5 +- test/multi-node/test_process_group.py | 15 + test/native/test_process_group.py | 1 - test/release/test_scipy_img_scale.sh | 34 + test/test_integration_shep_gs.py | 20 +- test/test_shepherd.py | 69 +- test/transport/tcp/test_address.py | 2 +- test/transport/test_lsif.py | 12 +- test/utils/test_attach.c | 7 - test/utils/test_basic_mempool.py | 4 +- 168 files changed, 9704 insertions(+), 1068 deletions(-) create mode 100644 doc/cbook/dict_torch_dataset.rst create mode 100644 doc/cbook/dragon_joblib.rst create mode 100644 doc/infrastructure/images/overlay_network_fanout.puml create mode 100644 doc/infrastructure/overlay_network.rst 
create mode 100644 doc/ref/data/index.rst create mode 100644 doc/ref/workflows/index.rst create mode 100644 examples/dragon_ai/README.md create mode 100644 examples/dragon_ai/dict_torch_dataset.py create mode 100644 examples/jupyter/doc_ref/basic_pandarallel_demo.py create mode 100644 examples/jupyter/doc_ref/bioinformatics_alignment_pandarallel_demo.py create mode 100644 examples/jupyter/doc_ref/bioinformatics_alignment_pandarallel_multinode_demo.py create mode 100644 examples/multiprocessing/joblib/bench_auto_batching.py create mode 100644 examples/multiprocessing/joblib/compressor_comparison.py create mode 100644 examples/multiprocessing/joblib/delayed_comparison.py create mode 100644 examples/multiprocessing/joblib/memory_basic_usage.py create mode 100644 examples/multiprocessing/joblib/nested_parallel_memory.py create mode 100644 examples/multiprocessing/joblib/parallel_memmap.py create mode 100644 examples/multiprocessing/joblib/parallel_random_state.py create mode 100644 examples/multiprocessing/joblib/serialization_and_wrappers.py create mode 100644 src/dragon/ai/__init__.py create mode 100644 src/dragon/ai/torch/__init__.py create mode 100644 src/dragon/ai/torch/dataloader_monkeypatch.py create mode 100644 src/dragon/ai/torch/dictdataset.py create mode 100644 src/dragon/ai/torch/monkeypatching.py create mode 100644 src/dragon/infrastructure/gpu_desc.py create mode 100644 src/dragon/pydragon_fli.pyx create mode 100644 src/include/dragon/fli.h create mode 100644 src/lib/_fli.h create mode 100644 src/lib/fli.c create mode 100644 src/pkg/CHANGELOG.md create mode 100644 test/ai/torch/README.md create mode 100644 test/ai/torch/test_pytorch_patches.py create mode 100644 test/channels_subtests/test_fli.c create mode 100644 test/channels_subtests/test_fli.py create mode 100644 test/infrastructure/test_gpu_desc.py create mode 100644 test/infrastructure/test_policy.py create mode 100755 test/release/test_scipy_img_scale.sh diff --git a/.devcontainer/constraints.txt b/.devcontainer/constraints.txt index e23a787..a050090 100644 --- a/.devcontainer/constraints.txt +++ b/.devcontainer/constraints.txt @@ -1,5 +1,5 @@ alabaster==0.7.12 -attrs==22.1.0 +attrs==23.1.0 Babel==2.11.0 black==22.10.0 breathe==4.34.0 diff --git a/.devcontainer/requirements.txt b/.devcontainer/requirements.txt index 2b60a3f..2625ba3 100644 --- a/.devcontainer/requirements.txt +++ b/.devcontainer/requirements.txt @@ -17,3 +17,4 @@ sphinx-copybutton vacuum wheel jupyter +parsl diff --git a/doc/.gitignore b/doc/.gitignore index 38a0689..2086dbe 100644 --- a/doc/.gitignore +++ b/doc/.gitignore @@ -5,6 +5,8 @@ ref/client/dragon*.rst ref/inf/dragon*.rst ref/native/Python/dragon*.rst ref/mpbridge/dragon*.rst +ref/data/dragon*.rst +ref/workflows/dragon*.rst ref/native/Python/dragon*.rst *.svg internal/services/transport_agent/tcp/* diff --git a/doc/cbook/ai-in-the-loop.rst b/doc/cbook/ai-in-the-loop.rst index a25b9c4..9daa463 100755 --- a/doc/cbook/ai-in-the-loop.rst +++ b/doc/cbook/ai-in-the-loop.rst @@ -1,35 +1,36 @@ AI-in-the-loop Workflow +++++++++++++++++++++++++++++++++++++++++++++++++++ -This is an example of how Dragon can be used to execute an AI-in-the-loop workflow. -Inspiration for this demo comes from the NERSC-10 Workflow Archetypes White Paper. -This workflow most closely resembles the workflow scenario given as part of archetype four. - -In this example we use a small model implemented in PyTorch to compute an approximation to :math:`\sin(x)`. 
-In parallel to doing the inference with the model, we launch `sim-cheap` on four MPI ranks. -This MPI job computes the Taylor approximation to :math:`\sin(x)` and compares this with the output of the model. -If the difference is less than 0.05 we consider the model's approximation to be sufficiently accurate and print out the result with the exact result. -If the difference is larger than 0.05 we consider this a failure and re-train the model on a new set of data. - -To generate this data we launch `sim-expensive`. -This MPI job is launched on eight ranks-per-node and each rank generates 32 data points of the form :math:`(x, \sin(x))` where :math:`x \in X \tilde U(-\pi, \pi)`. -This data is aggregated into a PyTorch tensor and then used to train the model. -We then re-evaluate the re-trained model and decide if we need to re-train again or if the estimate is sufficiently accurate. +This is an example of how Dragon can be used to execute an AI-in-the-loop workflow. +Inspiration for this demo comes from the NERSC-10 Workflow Archetypes White Paper. +This workflow most closely resembles the workflow scenario given as part of archetype four. + +In this example we use a small model implemented in PyTorch to compute an approximation to :math:`\sin(x)`. +In parallel to doing the inference with the model, we launch `sim-cheap` on four MPI ranks. +This MPI job computes the Taylor approximation to :math:`\sin(x)` and compares this with the output of the model. +If the difference is less than 0.05 we consider the model's approximation to be sufficiently accurate and print out the result with the exact result. +If the difference is larger than 0.05 we consider this a failure and re-train the model on a new set of data. + +To generate this data we launch `sim-expensive`. +This MPI job is launched on eight ranks-per-node and each rank generates 32 data points of the form :math:`(x, \sin(x))` where :math:`x \in U(-\pi, \pi)`. +This data is aggregated into a PyTorch tensor and then used to train the model. +We then re-evaluate the re-trained model and decide if we need to re-train again or if the estimate is sufficiently accurate. We continue this loop until we've had five successes. -Figure 1 presents the structure of this main loop. It shows when each MPI application is launched and what portions are executed in parallel. +:numref:`ai-in-the-loop` presents the structure of this main loop. It shows when each MPI application is launched and what portions are executed in parallel. -.. figure:: images/ai-in-the-loop-workflow.jpg - :scale: 30% +.. figure:: images/ai-in-the-loop-workflow.jpg + :scale: 100% + :name: ai-in-the-loop - **Figure 1: Example AI-in-the-loop workflow ** + **Example AI-in-the-loop workflow** This example consists of the following python files: -* `ai-in-the-loop.py` - This is the main file. It contains functions for launching both MPI executables and parsing the results as well as imports functions defined in `model.py` and coordinates the model inference and training with the MPI jobs. +* `ai-in-the-loop.py` - This is the main file. It contains functions for launching both MPI executables and parsing the results as well as imports functions defined in `model.py` and coordinates the model inference and training with the MPI jobs. -* `model.py` - This file defines the model and provides some functions for model training and inference. +* `model.py` - This file defines the model and provides some functions for model training and inference. 
Below, we present the main python code (`ai-in-the-loop.py`) which acts as the coordinator of the workflow. The code of the other files can be found in the release package, inside `examples/workflows/ai-in-the-loop` directory. @@ -259,7 +260,7 @@ The code of the other files can be found in the release package, inside `example Installation ============ -After installing dragon, the only other dependency is on PyTorch. The PyTorch version and corresponding pip command can be found here (https://pytorch.org/get-started/locally/). +After installing dragon, the only other dependency is on PyTorch. The PyTorch version and corresponding pip command can be found here (https://pytorch.org/get-started/locally/). ``` > pip install torch torchvision torchaudio @@ -282,7 +283,7 @@ Example Output when run on 16 nodes with 8 MPI ranks-per-node used to generate d > make gcc -g -pedantic -Wall -I /opt/cray/pe/mpich/8.1.26/ofi/gnu/9.1/include -L /opt/cray/pe/mpich/8.1.26/ofi/gnu/9.1/lib -c -o sim-cheap.o sim-cheap.c gcc -g -pedantic -Wall -I /opt/cray/pe/mpich/8.1.26/ofi/gnu/9.1/include -L /opt/cray/pe/mpich/8.1.26/ofi/gnu/9.1/lib sim-cheap.o -o sim-cheap -lm -L /opt/cray/pe/mpich/8.1.26/ofi/gnu/9.1/lib -lmpich - gcc -g -pedantic -Wall -I /opt/cray/pe/mpich/8.1.26/ofi/gnu/9.1/include -L /opt/cray/pe/mpich/8.1.26/ofi/gnu/9.1/lib -c -o sim-expensive.o + gcc -g -pedantic -Wall -I /opt/cray/pe/mpich/8.1.26/ofi/gnu/9.1/include -L /opt/cray/pe/mpich/8.1.26/ofi/gnu/9.1/lib -c -o sim-expensive.o gcc -g -pedantic -Wall -I /opt/cray/pe/mpich/8.1.26/ofi/gnu/9.1/include -L /opt/cray/pe/mpich/8.1.26/ofi/gnu/9.1/lib sim-expensive.o -o sim-expensive -lm -L /opt/cray/pe/mpich/8.1.26/ofi/gnu/9.1/lib -lmpich > salloc --nodes=16 --exclusive > dragon ai-in-the-loop.py diff --git a/doc/cbook/basic_pandarallel_demo.rst b/doc/cbook/basic_pandarallel_demo.rst index 04042d1..4feb65e 100644 --- a/doc/cbook/basic_pandarallel_demo.rst +++ b/doc/cbook/basic_pandarallel_demo.rst @@ -1,8 +1,8 @@ Basic Pandarallel Demonstration for Single Node Environment ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -This Jupyter benchmark is a simple use case for the pandarallel `parallel_apply` call. -It can be run with `dragon` and base multiprocessing to compare performance on your machine. +This Jupyter benchmark is a simple use case for the pandarallel `parallel_apply` call. +It can be run with `dragon` and base multiprocessing to compare performance on your machine. The program demonstrates how to use `parallel_apply`, the multiprocessing verison of pandas `apply`, on a pandas dataframe with random input. @@ -12,36 +12,4 @@ The code demonstrates the following key concepts working with Dragon: * How to use pandarallel and pandas with Dragon and base multiprocessing * How pandarallel handles various dtypes -.. 
code-block:: python - :linenos: - :caption: **basic_pandarallel_demo.ipynb: A bioinformatics benchmark for aligning nucleotide sequences and amino acid sequences** - - import dragon - import multiprocessing - - import cloudpickle - - import numpy as np - import pandas as pd - - import pandarallel; pandarallel.__version__ - - multiprocessing.set_start_method("dragon") - pandarallel.core.dill = cloudpickle - pandarallel.core.CONTEXT = multiprocessing.get_context("dragon") - pandarallel.pandarallel.initialize(progress_bar=True) - - num_rows = 10 - - df = pd.DataFrame( - { - "seqnum": np.arange(42, (42 + num_rows), dtype=int), - #"metric_A": np.random.rand(num_rows), - #"metric_B": np.random.rand(num_rows), - "metric_C": np.random.rand(num_rows), - "alt_seq": np.random.randint(low=42, high=(42 + num_rows), size=(num_rows,)), - "label": np.array(list("ATCG"))[np.random.randint(0, 4, num_rows)], - }, - ) - - df['highlow_C'] = df['metric_C'].parallel_apply(lambda x: x < cutoff) +.. literalinclude:: ../../examples/jupyter/doc_ref/basic_pandarallel_demo.py diff --git a/doc/cbook/bioinfo_alignment_pandarallel_demo.rst b/doc/cbook/bioinfo_alignment_pandarallel_demo.rst index a3000e0..467038c 100644 --- a/doc/cbook/bioinfo_alignment_pandarallel_demo.rst +++ b/doc/cbook/bioinfo_alignment_pandarallel_demo.rst @@ -19,43 +19,12 @@ The code demonstrates the following key concepts working with Dragon: * How to utilize pandarallel in a multi-node environment * How to utilize k-means clustering on features such as alignment, E value, and percentage coverage -.. code-block:: python - :linenos: - :caption: **bioinformatics_alignment_pandarallel_demo.ipynb: A bioinformatics benchmark for aligning nucleotide sequences and amino acid sequences** - - import dragon - import multiprocessing - - import cloudpickle - - import os - os.environ['OPENBLAS_NUM_THREADS'] = '1' - - import numpy as np - import pandas as pd - - import Bio - from Bio import SeqIO, Entrez - import pyalign - import time - import matplotlib.pyplot as plt - from sklearn.cluster import KMeans - import seaborn as sns - import pandarallel; pandarallel.__version__ - - multiprocessing.set_start_method("dragon") - pandarallel.core.dill = cloudpickle - pandarallel.core.CONTEXT = multiprocessing.get_context("dragon") - pandarallel.pandarallel.initialize(progress_bar=True) - - start = time.monotonic() - nucl_df['PyAlign Alignment Score'] = nucl_df['Sequence'].parallel_apply(lambda seq2: alignment_algorithm(endo_nucl_seq, seq2, gap=0)) - stop = time.monotonic() - functions, bar_num, tot_time = ['PyAlign Alignment Score'],[128],[stop-start] +The following notebook was used for the single-node comparison: +.. literalinclude:: ../../examples/jupyter/doc_ref/bioinformatics_alignment_pandarallel_demo.py For the single-node run, both base multiprocessing and Dragon are compared. The runs utilized a single node with 2 AMD EPYC 7742 64-Core Processors with 128 cores. -Dragon employs a number of optimizations on base multiprocessing; the Dragon start method outperforms the use of the base multiprocessing spawn start method on the same hardware. +Dragon employs a number of optimizations on base multiprocessing; the Dragon start method outperforms the use of the base multiprocessing spawn start method on the same hardware. The timing for the base multiprocessing runtime is: @@ -102,10 +71,14 @@ The timing for the single-node Dragon runtime is: - - 27.174203 -For multi-node Dragon run, the run was on 2 Apollo nodes. 
Each Apollo node has 1x AMD Rome CPU with 4x AMD MI100 GPUs and 128 cores. -The multi-node use case scales with the total number of CPUs reported by the allocation. As there are more nodes, workers, and CPUs available for multi-node, Dragon extends -multiprocessing's stock capabilities and demonstrates additional improvement to measured execution time. -Base multiprocessing does not support multi-node workloads. +For multi-node Dragon run, the run was on 2 Apollo nodes. Each Apollo node has 1x AMD Rome CPU with 4x AMD MI100 GPUs and 128 cores. +The multi-node use case scales with the total number of CPUs reported by the allocation. As there are more nodes, workers, and CPUs available for multi-node, Dragon extends +multiprocessing's stock capabilities and demonstrates additional improvement to measured execution time. +Base multiprocessing does not support multi-node workloads. + +The following notebook was used for the multi-node comparison: + +.. literalinclude:: ../../examples/jupyter/doc_ref/bioinformatics_alignment_pandarallel_multinode_demo.py The timing for the multi-node Dragon runtime is: diff --git a/doc/cbook/dict_torch_dataset.rst b/doc/cbook/dict_torch_dataset.rst new file mode 100644 index 0000000..8f3e4bc --- /dev/null +++ b/doc/cbook/dict_torch_dataset.rst @@ -0,0 +1,83 @@ +PyTorch Dataset Usage with Dragon Distributed Dictionary +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +This example shows how a PyTorch dataset can use a Dragon distributed dictionary to store the data. +In principle, the distributed dictionary could be shared among other processes that might interact with the training data between training iterations. +The program must be run with GPUs. + +The code demonstrates how the following key concepts work with Dragon: + +* How to utilize Dragon and the PyTorch dataloader and neural network model for training on GPUs +* How to use the distributed Dragon dictionary with multiprocessing queues + +.. literalinclude:: ../../examples/dragon_ai/dict_torch_dataset.py + +Installation +============ + +After installing dragon, the only other dependency is on PyTorch. The PyTorch version and corresponding pip command can be found here (https://pytorch.org/get-started/locally/). + +``` +> pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 +``` + +Description of the system used +============================== + +For this example, an HPE Cray EX was used. Each node has AMD EPYC 7763 64-core CPUs and 4x Nvidia A100 GPUs. + +How to run +========== + +Example Output when run on 2 nodes with 2 MNIST workers, 1 device per node, 2 epochs, CUDA training, 4 dragon dict managers, and dragon dict memory. +------------------------------------------------------------------------------------- + +.. 
code-block:: console + :linenos: + + > salloc --nodes=2 -p allgriz --exclusive -t 1:00:00 + > dragon dict_torch_dataset.py --mnist-workers 4 --devices-per-node 1 --epochs 2 + Number of nodes: 2 + Number of MNIST workers: 2 + Number of dragon dict managers: 4 + 100.0% + 100.0% + 100.0% + 100.0% + Rank 0 Train Epoch: 1 [0/60000 (0%)] Loss: 2.316082 + Rank 1 Train Epoch: 1 [0/60000 (0%)] Loss: 2.313832 + Rank 0 Train Epoch: 1 [6400/60000 (11%)] Loss: 0.268168 + Rank 1 Train Epoch: 1 [6400/60000 (11%)] Loss: 0.436355 + Rank 0 Train Epoch: 1 [12800/60000 (21%)] Loss: 0.190972 + Rank 1 Train Epoch: 1 [12800/60000 (21%)] Loss: 0.205474 + Rank 0 Train Epoch: 1 [19200/60000 (32%)] Loss: 0.187326 + Rank 1 Train Epoch: 1 [19200/60000 (32%)] Loss: 0.568415 + Rank 0 Train Epoch: 1 [25600/60000 (43%)] Loss: 0.093499 + Rank 1 Train Epoch: 1 [25600/60000 (43%)] Loss: 0.058430 + Rank 0 Train Epoch: 1 [32000/60000 (53%)] Loss: 0.060121 + Rank 1 Train Epoch: 1 [32000/60000 (53%)] Loss: 0.149605 + Rank 0 Train Epoch: 1 [38400/60000 (64%)] Loss: 0.156384 + Rank 1 Train Epoch: 1 [38400/60000 (64%)] Loss: 0.119814 + Rank 0 Train Epoch: 1 [44800/60000 (75%)] Loss: 0.082197 + Rank 1 Train Epoch: 1 [44800/60000 (75%)] Loss: 0.096987 + Rank 0 Train Epoch: 1 [51200/60000 (85%)] Loss: 0.053689 + Rank 1 Train Epoch: 1 [51200/60000 (85%)] Loss: 0.101078 + Rank 0 Train Epoch: 1 [57600/60000 (96%)] Loss: 0.031515 + Rank 1 Train Epoch: 1 [57600/60000 (96%)] Loss: 0.090198 + Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz + Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./torch-data-dict/data/MNIST/raw/train-images-idx3-ubyte.gz + Extracting ./torch-data-dict/data/MNIST/raw/train-images-idx3-ubyte.gz to ./torch-data-dict/data/MNIST/raw + + Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz + Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./torch-data-dict/data/MNIST/raw/train-labels-idx1-ubyte.gz + Extracting ./torch-data-dict/data/MNIST/raw/train-labels-idx1-ubyte.gz to ./torch-data-dict/data/MNIST/raw + + Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz + Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./torch-data-dict/data/MNIST/raw/t10k-images-idx3-ubyte.gz + Extracting ./torch-data-dict/data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./torch-data-dict/data/MNIST/raw + + Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz + Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./torch-data-dict/data/MNIST/raw/t10k-labels-idx1-ubyte.gz + Extracting ./torch-data-dict/data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./torch-data-dict/data/MNIST/raw + + diff --git a/doc/cbook/distr-inf-telemetry.rst b/doc/cbook/distr-inf-telemetry.rst index 3f9af67..6e32321 100644 --- a/doc/cbook/distr-inf-telemetry.rst +++ b/doc/cbook/distr-inf-telemetry.rst @@ -183,34 +183,36 @@ which we update every second until the end event is set. Note line 17 where we s Examples of input and Output ============================ -Figure 1 provides an example of an input and the response the user receives from the chatbot. - +:numref:`single-prompt-response` provides an example of an input and the response the user receives from the chatbot. .. 
figure:: images/llm-grafana-single-prompt-response.jpg :scale: 60% + :name: single-prompt-response - **Figure 1: Input prompt and response with IDs for the prompter, inference worker, and response worker** + **Input prompt and response with IDs for the prompter, inference worker, and response worker** To simulate many different users iteracting with a chatbot, we loop over a list of fifteen prompts seven times giving a total of 105 prompts that the four inference workers -to respond to. The input loop and prompts are shown in Figure 2. A sample telemetry output as displayed in Grafana after all these prompts are processed is -shown in Figure 3. Note how the utilization is nearly equal among the GPUs with all starting and ending at the same time. The spikes in utilization prior to +to respond to. The input loop and prompts are shown in :numref:`loop-over-prompts`. A sample telemetry output as displayed in Grafana after all these prompts are processed is +shown in :numref:`node-telemetry` . Note how the utilization is nearly equal among the GPUs with all starting and ending at the same time. The spikes in utilization prior to the running of the many prompts are from the models being loaded onto the GPUs at the start up of the inference workers and the worker that responded to the prompt -in Figure 1. +in :numref:`single-prompt-response`. .. figure:: images/llm-grafana-many-prompts.jpg :scale: 50% + :name: loop-over-prompts - **Figure 2: Loop over list of prompts to simulate many users** + **Loop over list of prompts to simulate many users** .. figure:: images/llm-grafana-telem-data.jpg :scale: 60% + :name: node-telemetry - **Figure 3: Node telemetry data that is visualized using Grafana GUI and highlights the load balanced nature of this example** + **Node telemetry data that is visualized using Grafana GUI and highlights the load balanced nature of this example** diff --git a/doc/cbook/dragon_dict.rst b/doc/cbook/dragon_dict.rst index e83d5b7..a2b67b0 100644 --- a/doc/cbook/dragon_dict.rst +++ b/doc/cbook/dragon_dict.rst @@ -11,8 +11,9 @@ Architecture of Dragon Dictionay .. figure:: images/dragon_dict_architecture.png :align: center :scale: 30% + :name: high-level-arch - **Figure 1: High-level architecture of a Dragon Dictionary** + **High-level architecture of a Dragon Dictionary** From Python code, a user instantiates a dragon dictionary specifying the number of back-end managers. During bring-up of a dragon dictionary, a pool of manager processes are started along with a collection of dragon channels used for communication between clients and managers. Each of the managers @@ -95,9 +96,9 @@ The dictionary is spawned from across 1 node to 64 nodes with each manager worke with each key of constant size of 30 bytes in the dictionary. The results clearly demonstrate the advantage of distributed dictionary, with increased aggregated rate of opearations as the dictionary managers are spawned across the increasing number of nodes. - .. figure:: images/dragon_dict_results.png :align: center :scale: 25% + :name: multinode-results - **Figure 2: Results on a multi-node setup** + **Results on a multi-node setup** diff --git a/doc/cbook/dragon_joblib.rst b/doc/cbook/dragon_joblib.rst new file mode 100644 index 0000000..e33b672 --- /dev/null +++ b/doc/cbook/dragon_joblib.rst @@ -0,0 +1,718 @@ +Dragon JobLib Examples +++++++++++++++++++++++ +The subdirectory `JobLib` contains the joblib examples and benchmarks that are compatible with Dragon. 
+ +JobLib is used for on demand computing, transparent parallelization, data tracking, and data flow inspection. +Dragon allows for further optimization of multiprocessing joblib workloads For the multi-node systems. + +The following use cases compare the performance of joblib workloads using `dragon` and base multiprocessing. +It is important to note that the joblib backend must be set to `multiprocessing` for the Dragon package +to work without errors. + +The most interesting use cases involve joblib's `Parallel` function. Most of the use cases build around `Parallel`. `Parallel` allows for readable code with proper argument construction, +informative tracebacks, the ability to turn on and off parallel computing with `n_jobs`, efficient memory usage, and flexible pickling control. + +The code demonstrates the following key concepts working with Dragon: + +* How to write joblib programs that can run with Dragon and base multiprocessing +* A comparison of joblib with `dragon`, base multiprocessing, and multi-node with larger Dragon processes. + +The set up for Single-node run for both base multiprocessing and `dragon`: For the single-node run, both base multiprocessing and Dragon are compared. The runs utilized a Single-node with 2 AMD EPYC 7742 64-Core Processors with 128 cores. +Dragon employs a number of optimizations on base multiprocessing. + +The set up for the multi node run for `dragon`: For the multi-node Dragon run, the run was on 2 Apollo nodes. Each Apollo node has 1x AMD Rome CPU with 4x AMD MI100 GPUs and 128 cores. +The multi-node use case scales with the total number of CPUs reported by the allocation. As there are more nodes, workers, and CPUs available For the multi-node, Dragon extends +multiprocessing's stock capabilities. +Base multiprocessing does not support multi-node workloads. + +In alphabetical order, these are the following joblib use cases and their usefulness: + +.. literalinclude:: ../../examples/multiprocessing/joblib/bench_auto_batching.py + +The timing for the base multiprocessing runtime is: + +.. list-table:: Base Multiprocessing Timings for Bench Auto Batching + :widths: 25 25 25 25 + :header-rows: 1 + + * - n_jobs + - Workload Name + - Number of Tasks + - Time in seconds + * - 2 + - high variance, no trend + - 5000 + - 1.648 + * - 2 + - low variance, no trend + - 5000 + - 1.692 + * - 2 + - cyclic trends + - 300 + - 4.165 + * - 2 + - shuffling of the previous benchmark: same mean and variance + - 300 + - 4.150 + * - 4 + - high variance, no trend + - 5000 + - 1.64 + * - 4 + - low variance, no trend + - 5000 + - 1.42 + * - 4 + - cyclic trends + - 300 + - 2.196 + * - 4 + - shuffling of the previous benchmark: same mean and variance + - 300 + - 2.215 + * - 8 + - high variance, no trend + - 5000 + - 0.908 + * - 8 + - low variance, no trend + - 5000 + - 0.829 + * - 8 + - cyclic trends + - 300 + - 1.382 + * - 8 + - shuffling of the previous benchmark: same mean and variance + - 300 + - 1.227 + * - 16 + - high variance, no trend + - 5000 + - 1.178 + * - 16 + - low variance, no trend + - 5000 + - 0.906 + * - 16 + - cyclic trends + - 300 + - 0.993 + * - 16 + - shuffling of the previous benchmark: same mean and variance + - 300 + - 0.941 + * - 32 + - high variance, no trend + - 5000 + - 1.124 + * - 32 + - low variance, no trend + - 5000 + - 1.122 + * - 32 + - cyclic trends + - 300 + - 0.907 + * - 32 + - shuffling of the previous benchmark: same mean and variance + - 300 + - 0.904 + +The timing for the single-node Dragon runtime is: + +.. 
list-table:: Dragon Timings for Bench Auto Batching + :widths: 25 25 25 25 + :header-rows: 1 + + * - n_jobs + - Workload Name + - Number of Tasks + - Time in seconds + * - 2 + - high variance, no trend + - 5000 + - 4.445 + * - 2 + - low variance, no trend + - 5000 + - 5.667 + * - 2 + - cyclic trends + - 300 + - 8.669 + * - 2 + - shuffling of the previous benchmark: same mean and variance + - 300 + - 7.27 + * - 4 + - high variance, no trend + - 5000 + - 4.318 + * - 4 + - low variance, no trend + - 5000 + - 3.883 + * - 4 + - cyclic trends + - 300 + - 4.993 + * - 4 + - shuffling of the previous benchmark: same mean and variance + - 300 + - 5.367 + * - 8 + - high variance, no trend + - 5000 + - 4.660 + * - 8 + - low variance, no trend + - 5000 + - 3.926 + * - 8 + - cyclic trends + - 300 + - 4.740 + * - 8 + - shuffling of the previous benchmark: same mean and variance + - 300 + - 4.65 + * - 16 + - high variance, no trend + - 5000 + - 5.451 + * - 16 + - low variance, no trend + - 5000 + - 5.358 + * - 16 + - cyclic trends + - 300 + - 4.446 + * - 16 + - shuffling of the previous benchmark: same mean and variance + - 300 + - 4.361 + * - 32 + - high variance, no trend + - 5000 + - 10.295 + * - 32 + - low variance, no trend + - 5000 + - 18.751 + * - 32 + - cyclic trends + - 300 + - 6.577 + * - 32 + - shuffling of the previous benchmark: same mean and variance + - 300 + - 5.998 + +The timing for the multi-node Dragon runtime is: + +.. list-table:: Multi-node Dragon Timings for Bench Auto Batching + :widths: 25 25 25 25 + :header-rows: 1 + + * - n_jobs + - Workload Name + - Number of Tasks + - Time in seconds + * - 2 + - high variance, no trend + - 5000 + - 6.007959 + * - 2 + - low variance, no trend + - 5000 + - 8.862581 + * - 2 + - cyclic trends + - 300 + - 8.567808 + * - 2 + - shuffling of the previous benchmark: same mean and variance + - 300 + - 8.607972 + * - 4 + - high variance, no trend + - 5000 + - 6.007959 + * - 4 + - low variance, no trend + - 5000 + - 8.862581 + * - 4 + - cyclic trends + - 300 + - 8.567808 + * - 4 + - shuffling of the previous benchmark: same mean and variance + - 300 + - 8.607972 + * - 8 + - high variance, no trend + - 5000 + - 7.252201 + * - 8 + - low variance, no trend + - 5000 + - 6.686624 + * - 8 + - cyclic trends + - 300 + - 6.242919 + * - 8 + - shuffling of the previous benchmark: same mean and variance + - 300 + - 6.843477 + * - 16 + - high variance, no trend + - 5000 + - 7.252201 + * - 16 + - low variance, no trend + - 5000 + - 6.686624 + * - 16 + - cyclic trends + - 300 + - 6.242919 + * - 16 + - shuffling of the previous benchmark: same mean and variance + - 300 + - 6.843477 + * - 32 + - high variance, no trend + - 5000 + - 7.252201 + * - 32 + - low variance, no trend + - 5000 + - 6.686624 + * - 32 + - cyclic trends + - 300 + - 6.242919 + * - 32 + - shuffling of the previous benchmark: same mean and variance + - 300 + - 6.843477 + +.. literalinclude:: ../../examples/multiprocessing/joblib/compressor_comparison.py + +The timing for the base multiprocessing runtime is: + +.. 
list-table:: Base Multiprocessing Timings for Compressor Comparison + :widths: 25 25 + :header-rows: 1 + + * - n_jobs + - Statistics + * - Raw dump duration + - 1.458s + * - Raw dump file size + - 167.218MB + * - Raw load duration + - 0.061s + * - Zlib dump duration + - 0.624s + * - Zlib file size + - 3.943MB + * - Zlib load duration + - 0.210s + * - LZMA dump duration + - 1.640s + * - LZMA file size + - 2.118MB + * - LZMA load duration + - 0.349s + * - LZ4 file size + - 2.118MB + * - LZMA load duration + - 0.331s + +The timing for the single-node Dragon runtime is: + +.. list-table:: Dragon Timings for Compressor Comparison + :widths: 25 25 + :header-rows: 1 + + * - n_jobs + - Statistics + * - Raw dump duration + - 1.454s + * - Raw dump file size + - 167.218MB + * - Raw load duration + - 0.062s + * - Zlib dump duration + - 0.640s + * - Zlib file size + - 3.943MB + * - Zlib load duration + - 0.218s + * - LZMA dump duration + - 1.639s + * - LZMA file size + - 2.118MB + * - LZMA load duration + - 0.348s + * - LZ4 file size + - 2.118MB + * - LZMA load duration + - 0.334s + +The timing for the multi-node Dragon runtime is: + +.. list-table:: Multi-node Dragon Timings for Compressor Comparison + :widths: 25 25 + :header-rows: 1 + + * - n_jobs + - Statistics + * - Raw dump duration + - 1.577s + * - Raw dump file size + - 167.218MB + * - Raw load duration + - 1.483s + * - Zlib dump duration + - 0.883s + * - Zlib file size + - 3.943MB + * - Zlib load duration + - 0.275s + * - LZMA dump duration + - 2.098s + * - LZMA file size + - 2.118MB + * - LZMA load duration + - 0.420s + * - LZ4 file size + - 2.118MB + * - LZMA load duration + - 0.414s + +.. literalinclude:: ../../examples/multiprocessing/joblib/delayed_comparison.py + +The timing for the base multiprocessing runtime is: + +.. list-table:: Base Multiprocessing Timings for Delayed Comparison + :widths: 25 25 + :header-rows: 1 + + * - Type of parallel run + - Time in seconds + * - Without delayed + - 10.75817883014679 + * - With delayed + - 0.010308943688869476 + +The timing for the single-node Dragon runtime is: + +.. list-table:: Dragon Timings for Delayed Comparison + :widths: 25 25 + :header-rows: 1 + + * - Type of parallel run + - Time in seconds + * - Without delayed + - 10.73451592773199 + * - With delayed + - 0.010201960802078247 + +The timing for the multi-node Dragon runtime is: + +.. list-table:: Multi-node Dragon Timings for Delayed Comparison + :widths: 25 25 + :header-rows: 1 + + * - Type of parallel run + - Time in seconds + * - Without delayed + - 10.547747920732945 + * - With delayed + - 0.015844576992094517 + +.. literalinclude:: ../../examples/multiprocessing/joblib/memory_basic_usage.py + +The timing for the base multiprocessing runtime is: + +.. list-table:: Base Multiprocessing Timings for Memory Basic Usage + :widths: 25 25 + :header-rows: 1 + + * - Type of parallel run + - Time in seconds + * - First transformation + - 5.01 + * - Second transformation + - 5.08 + * - Third transformation + - 0.01 + * - Fourth transformation + - 5.09 + * - Fifth transformation + - 0.01 + +The timing for the single-node Dragon runtime is: + +.. list-table:: Single-node Dragon Runtimes for Memory Basic Usage + :widths: 25 25 + :header-rows: 1 + + * - Type of parallel run + - Time in seconds + * - First transformation + - 5.01 + * - Second transformation + - 5.06 + * - Third transformation + - 0.01 + * - Fourth transformation + - 5.07 + * - Fifth transformation + - 0.01 + +The timing for the multi-node Dragon runtime is: + +.. 
list-table:: Multi-node Dragon Runtimes for Memory Basic Usage + :widths: 25 25 + :header-rows: 1 + + * - Type of parallel run + - Time in seconds + * - First transformation + - 5.00 + * - Second transformation + - 5.06 + * - Third transformation + - 0.02 + * - Fourth transformation + - 5.12 + * - Fifth transformation + - 0.02 + +.. literalinclude:: ../../examples/multiprocessing/joblib/nested_parallel_memory.py + +The timing for the base multiprocessing runtime is: + +.. list-table:: Base Multiprocessing Timings for Nested Parallel Memory + :widths: 25 25 + :header-rows: 1 + + * - Process step + - Time in seconds + * - First sequential processing + - 8.01 + * - First round - caching the data + - 4.09 + * - Second round - reloading the cache + - 0.05 + * - Reusing intermediate checkpoints + - 0.04 + * - Second sequential processing + - 8.01 + * - First round - caching the data + - 4.12 + * - Second round - reloading the cache + - 0.05 + * - Reusing intermediate checkpoints + - 0.04 + +The timing for the single-node Dragon runtime is: + +.. list-table:: Single-node Dragon Timings for Nested Parallel Memory + :widths: 25 25 + :header-rows: 1 + + * - Process step + - Time in seconds + * - First sequential processing + - 8.01 + * - First round - caching the data + - 6.96 + * - Second round - reloading the cache + - 3.18 + * - Reusing intermediate checkpoints + - 3.18 + * - Second sequential processing + - 8.01 + * - First round - caching the data + - 7.17 + * - Second round - reloading the cache + - 3.16 + * - Reusing intermediate checkpoints + - 2.66 + +The timing for the multi-node Dragon runtime is: + +.. list-table:: Multi-node Dragon Timings for Nested Parallel Memory + :widths: 25 25 + :header-rows: 1 + + * - Process step + - Time in seconds + * - First sequential processing + - 8.01 + * - First round - caching the data + - 6.96 + * - Second round - reloading the cache + - 3.18 + * - Reusing intermediate checkpoints + - 3.18 + * - Second sequential processing + - 8.01 + * - First round - caching the data + - 7.17 + * - Second round - reloading the cache + - 3.16 + * - Reusing intermediate checkpoints + - 2.66 + +.. literalinclude:: ../../examples/multiprocessing/joblib/parallel_memmap.py + +The timing for the base multiprocessing runtime is: + +.. list-table:: Base Multiprocessing Timings for Parallel Memory Map + :widths: 25 25 + :header-rows: 1 + + * - Process step + - Time in seconds + * - First elapsed time computing average of slices + - 0.98 + * - Second elapsed time computing average of slices + - 3.93 + * - Third elapsed time computing average of slices + - 6.82 + +The timing for the single-node Dragon runtime + +.. list-table:: Single-node Dragon Timings for Parallel Memory Map + :widths: 25 25 + :header-rows: 1 + + * - Process step + - Time in seconds + * - First elapsed time computing average of slices + - 0.99 + * - Second elapsed time computing average of slices + - 4.15 + * - Third elapsed time computing average of slices + - 5.28 + +The timing for the multi-node Dragon runtime + +.. list-table:: Multi-node Dragon Timings for Parallel Memory Map + :widths: 25 25 + :header-rows: 1 + + * - Process step + - Time in seconds + * - First elapsed time computing average of slices + - 0.97 + * - Second elapsed time computing average of slices + - 4.89 + * - Third elapsed time computing average of slices + - 6.87 + + +.. literalinclude:: ../../examples/multiprocessing/joblib/parallel_random_state.py + + The timing for the base multiprocessing runtime is: + +.. 
list-table:: Base Multiprocessing Timings for Parallel Random State + :widths: 25 25 + :header-rows: 1 + + * - Process step + - Time in seconds + * - First iteratation (generation of stochastic vector) + - 0.02696242928504944 + * - Second iteratation (replacement of stochastic vector) + - 0.0243108868598938 + * - Third iteratation (replacement of second iteration stochastic vector) + - 0.031805530190467834 + +The timing for the single-node Dragon runtime is: + +.. list-table:: Single-Node Dragon Timings for Parallel Random State + :widths: 25 25 + :header-rows: 1 + + * - Process step + - Time in seconds + * - First iteratation (generation of stochastic vector) + - 2.8984111174941063 + * - Second iteratation (replacement of stochastic vector) + - 3.1529479399323463 + * - Third iteratation (replacement of second iteration stochastic vector) + - 3.170066222548485 + +The timing for the multi-node Dragon runtime is: + +.. list-table:: Multi-node Dragon Timings for Parallel Random State + :widths: 25 25 + :header-rows: 1 + + * - Process step + - Time in seconds + * - First iteratation (generation of stochastic vector) + - 3.2446429850533605 + * - Second iteratation (replacement of stochastic vector) + - 3.3172717401757836 + * - Third iteratation (replacement of second iteration stochastic vector) + - 3.0256078988313675 + + +.. literalinclude:: ../../examples/multiprocessing/joblib/serialization_and_wrappers.py + + The timing for the base multiprocessing runtime is: + +.. list-table:: Base Multiprocessing Timings for Serialization and Wrappers + :widths: 25 25 + :header-rows: 1 + + * - Serialization Type + - Time in seconds + * - With loky backend and cloudpickle serialization + - 0.085 + * - With multiprocessing backend and pickle serialization + - 0.093 + * - With pickle serialization + - 0.080 + +The timing for the single-node Dragon runtime is: + +.. list-table:: Single-node Dragon Timings for Serialization and Wrappers + :widths: 25 25 + :header-rows: 1 + + * - Serialization Type + - Time in seconds + * - With loky backend and cloudpickle serialization + - 3.147 + * - With multiprocessing backend and pickle serialization + - 3.127 + * - With pickle serialization + - 2.653 + +The timing for the multi-node Dragon runtime is: + +.. list-table:: Multi-node Dragon Timings for Serialization and Wrappers + :widths: 25 25 + :header-rows: 1 + + * - Serialization Type + - Time in seconds + * - With loky backend and cloudpickle serialization + - 3.343 + * - With multiprocessing backend and pickle serialization + - 2.976 + * - With pickle serialization + - 3.581 \ No newline at end of file diff --git a/doc/cbook/torch-scipy-telemetry.rst b/doc/cbook/torch-scipy-telemetry.rst index 93aeaa2..9a24615 100644 --- a/doc/cbook/torch-scipy-telemetry.rst +++ b/doc/cbook/torch-scipy-telemetry.rst @@ -20,14 +20,15 @@ The example consists of four components: We start a pool of workers for the mnist computation, a different pool of workers for the SciPy computation, as many monitor processes as the number of nodes that Dragon uses (it could be a subset of the node allocation) and a single post-processing process. All the workers are distributed across the available nodes. -Figure 1 presents the structure of a toy example with 4 compute nodes and shows the basic architecture and process placement. The shared queue +:numref:`structure-of-demo` presents the structure of a toy example with 4 compute nodes and shows the basic architecture and process placement. 
The shared queue lives on the same node as the process that created it. In our example, the head/main process creates the queue. The user main program and the head/main process live on compute node 1. .. figure:: images/telemetry_deployment_diagram.jpg :scale: 30% + :name: structure-of-demo - **Figure 1: Structure of the multi-node process orchestration and node telemetry demo on an allocation of 4 compute nodes** + **Structure of the multi-node process orchestration and node telemetry demo on an allocation of 4 compute nodes** This example consists of the following python files: diff --git a/doc/components/broadcast.rst b/doc/components/broadcast.rst index 20a99c0..c37af98 100644 --- a/doc/components/broadcast.rst +++ b/doc/components/broadcast.rst @@ -13,8 +13,9 @@ call to a trigger function. The payload is optional. The BCast object provides a synchronization/communication structure. .. figure:: images/bcast.png + :name: bcast-any-to-many - **Figure 1: An Any to Many Broadcast Synchronization Object** + **An Any to Many Broadcast Synchronization Object** A BCast object is meant to be shared by multiple threads/processes. The object is first created by a process. Then a serialized descriptor to it can be shared with other processes. Via this serialized descriptor other @@ -48,10 +49,11 @@ Triggering processes may trigger one or all processes that are waiting on a BCas .. figure:: images/bcastflow.srms1.png :scale: 75% + :name: ops-on-bcast - **Figure 2: Operations on a BCast Object** + **Operations on a BCast Object** -The flow diagram in Figure 2 shows an interaction with a BCast object and points out a few features/semantics of these +The flow diagram in :numref:`ops-on-bcast` shows an interaction with a BCast object and points out a few features/semantics of these synchronization/communication objects. The flow of interaction proceeds as follows: #. The process T1 creates the BCast object and through some means, communicates its location to all the other @@ -317,7 +319,7 @@ This creates a BCast object. FIXME - This example is not complete. Placeholder o .. code-block:: C :linenos: - :caption: **Figure 1: A BCast Example** + :caption: **A BCast Example** #include #include diff --git a/doc/components/managed_memory/bitset.rst b/doc/components/managed_memory/bitset.rst index ce69305..ab4926d 100644 --- a/doc/components/managed_memory/bitset.rst +++ b/doc/components/managed_memory/bitset.rst @@ -17,7 +17,8 @@ the functions. .. code-block:: C :linenos: - :caption: **Figure 1: A BitSet Example** + :caption: **A BitSet Example** + :name: bitset-example size_t bitsetsize; dragonBitSetErr_t brc; @@ -55,13 +56,14 @@ the functions. dragon_bitset_zeroes_to_right(&bset,0,&val); printf("The number is %lu\n",val); -The output from the example program in figure 1 is given in figure 2. The bits in the bitset display bit 0 on +The output from the example program in :numref:`bitset-example` is given in :numref:`bitset-example-output`. The bits in the bitset display bit 0 on the left, not the right. In this way, the bits display lexicographically in a dump from left to right for easier reading. Bit 0 is the left-most bit in the dump while the last bit is lexicographically the last bit to be displayed. .. code-block:: text - :caption: **Figure 2: BitSet Example Output** + :caption: **BitSet Example Output** + :name: bitset-example-output That was a one A Bit Dump @@ -85,10 +87,11 @@ the first 8 bytes, followed by enough bytes to hold the rest of the bits. 
Note t always use a multiple of 8 bytes for easy alignment with other data. When a BitSet is initialized, a handle to the BitSet is also initialized. The handle is now the user of this -API accesses the BitSet. The handle structure is given in figure 3. +API accesses the BitSet. The handle structure is given in :numref:`bitset-handle-def`. .. code-block:: C - :caption: **Figure 3: BitSet Handle Definition** + :caption: **BitSet Handle Definition** + :name: bitset-handle-def typedef struct dragonBitSet_st { size_t size; diff --git a/doc/components/managed_memory/heapmanager.rst b/doc/components/managed_memory/heapmanager.rst index 531dda4..1d150e3 100644 --- a/doc/components/managed_memory/heapmanager.rst +++ b/doc/components/managed_memory/heapmanager.rst @@ -21,18 +21,19 @@ An Example of Malloc and Free ============================= .. figure:: images/heapallocations.png + :name: heap-allocations - **Figure 1: A Sample Heap with Allocations** + **A Sample Heap with Allocations** Consider a 1K heap with a minimum block size of 32 bytes. The smallest allocatable block size is called a -segment. The 1K heap is made up of 32 segments, each of 32 bytes each. Figure 1 shows a heap with allocations +segment. The 1K heap is made up of 32 segments, each of 32 bytes each. :numref:`heap-allocations` shows a heap with allocations in colors. The first allocation was for 16 bytes, which resulted in a 32 byte allocation (the smallest possible size for this heap) and was allocated to segment 0. The second allocation was for 500 bytes (actually 512 bytes) which resulted in the allocation of segments 16-31. Then came an allocation of 64 bytes which went into segments 2 and 3. The allocation of segments 8-15 was for a request of 222 bytes but allocated 256 bytes since that is the nearest power of 2. Finally, the purple allocation from segments 4-7 resulted from a request of 112 bytes but again resulted in an allocation of 4 segments and a size of 128 bytes. So the mallocs that -lead to the allocations in figure 1 might be as follows. +lead to the allocations in :numref:`heap-allocations` might be as follows. * 32 bytes * 512 bytes @@ -46,14 +47,15 @@ powers. Each split operation is O(1). In this example, the maximum number of spl there were 5 splits required on the first allocation. The second allocation required 0 splits. The third required two splits. The fourth 0 splits. The fifth and final allocation required 0 splits. -A heap with this maximum block size and minimum block size is initialized as shown in figure 2. Since the heap +A heap with this maximum block size and minimum block size is initialized as shown in :numref:`heap-init`. Since the heap manager always manages blocks sizes of powers of 2, a heap is initialized by providing the maximum and minimum -block size powers. In figure 2 the 10 is the 1024 byte maximum block size and 5 is the 32 byte minimum block +block size powers. In :numref:`heap-init` the 10 is the 1024 byte maximum block size and 5 is the 32 byte minimum block size. .. code-block:: C :linenos: - :caption: **Figure 2: Heap Initialization** + :caption: **Heap Initialization** + :name: heap-init // make a heap of size 1K with 32 byte segments as minimum block size. How much space // is required? This call determines how much space is required for a heap with @@ -83,7 +85,7 @@ size. As blocks are freed, they are joined together into large free blocks if the block and its buddy are free. The buddy of a block is a block whose address differs from its address by a power of 2 size. 
For instance, segment -0 in the allocation of figure 1 has segment 1 as its buddy because they are at index 0 and index 1 of the list +0 in the allocation of :numref:`heap-allocations` has segment 1 as its buddy because they are at index 0 and index 1 of the list of segments. The block starting at segment 2 has its buddy starting at index 0, but since index 0 is currently split, the buddy of the green block is not available for joining to it once it is freed. To illustrate this joining of blocks, consider the following sequence of free requests. @@ -97,8 +99,9 @@ The algorithm doesn't consider anything further, but because segment 0 is in a b block is part of a block of 64 bytes, they could not be joined either (at this point anyway). .. figure:: images/heapfree1.png + :name: heap-free-green - **Figure 3: After Freeing the Green Block** + **After Freeing the Green Block** Freeing the Purple Block Starting at Segment 4 ---------------------------------------------- @@ -109,8 +112,9 @@ are three free blocks that are available in the heap. The segment 1 is a 32 byte make up a 64 byte free block. Finally, the segments 4-6 make up a 128 byte free block. .. figure:: images/heapfree2.png + :name: heap-free-purple - **Figure 4: After Freeing the Purple Block** + **After Freeing the Purple Block** Freeing the Yellow Block Starting at Segment 16 ----------------------------------------------- @@ -119,8 +123,9 @@ The 512 byte block starting at segment 16 is freed next and results in once agai segment 0. Again, segment 0 is not free and no further joining of blocks is possible. .. figure:: images/heapfree3.png + :name: heap-free-yellow - **Figure 5: After Freeing the Yellow Block** + **After Freeing the Yellow Block** Freeing the Orange Block Starting at Segment 0 ---------------------------------------------- @@ -137,8 +142,9 @@ At this point there are two free blocks: a 256 byte block starting at segment 0 at segment 16. .. figure:: images/heapfree4.png + :name: heap-free-orange - **Figure 6: After Freeing the Orange Block** + **After Freeing the Orange Block** Freeing the Maroon Block Starting at Segment 8 ---------------------------------------------- @@ -149,8 +155,9 @@ but since its buddy is also free and the same size, the two 512 byte blocks are block. .. figure:: images/heapfree5.png + :name: heap-free-maroon - **Figure 7: After Freeing the Maroon Block** + **After Freeing the Maroon Block** Meta-Data and Handles ===================== @@ -161,8 +168,9 @@ two bit sets, the block set and the free set. There is also a lock associated wi multi-processing compatible. .. figure:: images/metadata.png + :name: metadata - **Figure 8: Meta-Data and Handle Structure** + **Meta-Data and Handle Structure** In the meta-data the *Segments Ptr* is necessary because there might be padding between the *Free Lists* and the beginning of the segments, depending on the requested alignment of the segments when the heap is @@ -173,7 +181,7 @@ re-compute the starting segments address. .. code-block:: C :linenos: - :caption: **Figure 9: C Handle Definition** + :caption: **C Handle Definition** typedef void dragonDynHeapSegment_t; diff --git a/doc/components/managed_memory/hexdump.rst b/doc/components/managed_memory/hexdump.rst index 29ccf19..1dc9a2e 100644 --- a/doc/components/managed_memory/hexdump.rst +++ b/doc/components/managed_memory/hexdump.rst @@ -20,17 +20,19 @@ Hex Dump has an easy to use interface. You provide a pointer and a length to it an indentation string. .. 
code-block:: C - :caption: **Figure 1: Hex Dump Example Code** + :caption: **Hex Dump Example Code** + :name: hex-dump-example-code hex_dump_to_fd(fd, "BITS",(void*)set->data,num_bytes,indent); -When invoked, The output looks something like that found in figure 2. Note how lines with 0's are suppressed. +When invoked, The output looks something like that found in :numref:`hex-dump-example-output`. Note how lines with 0's are suppressed. The *fd* is a file, which includes the possibility of using *stdout* or *stderr*. A title comes second followed by a pointer to the data and the number of bytes to dump. Finally, the *indent* is a null-terminated string to print before each line of the dump. .. code-block:: text - :caption: **Figure 2: Hex Dump Sample Output** + :caption: **Hex Dump Sample Output** + :name: hex-dump-example-output * BITS: * 00007FCF60C97070 80 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ diff --git a/doc/components/managed_memory/managed_memory.rst b/doc/components/managed_memory/managed_memory.rst index 6118801..b94905a 100644 --- a/doc/components/managed_memory/managed_memory.rst +++ b/doc/components/managed_memory/managed_memory.rst @@ -31,8 +31,9 @@ Architecture ============ .. figure:: images/managed_memory.svg + :name: managed-mem-arch - **Figure 1: Architecture of the Managed Memory component** + **Architecture of the Managed Memory component** **MISSING Description of Request Handler and Pools** diff --git a/doc/components/scalable_locks/scalable_locks.rst.needs_work b/doc/components/scalable_locks/scalable_locks.rst.needs_work index 04ba6d8..cc80e88 100644 --- a/doc/components/scalable_locks/scalable_locks.rst.needs_work +++ b/doc/components/scalable_locks/scalable_locks.rst.needs_work @@ -21,8 +21,9 @@ Architecture ============ .. figure:: images/scalable_locks.svg + :name: scalable-locks - **Figure 1: Architecture of the Scalable Locks component** + **Architecture of the Scalable Locks component** Dragons scalable locks implementation provides access to @@ -251,14 +252,14 @@ Functions Attach to the FIFO-style lock previously mapped into the memory pointed to by *prt* and return a :c:type:`dragonFIFOLock_t` handle to the lock. - + Returns ``DRAGON_SUCCESS`` or an error code. .. c:function:: dragonError_t dragon_greedy_lock_attach(dragonGreedyLock_t * dlock, void * ptr) Attach to the greedy-style lock previously mapped into the memory pointed to by *prt* and return a :c:type:`dragonGreedyLock_t` handle to the lock. - + Returns ``DRAGON_SUCCESS`` or an error code. .. c:function:: dragonError_t dragon_fifo_lock_detach(dragonFIFOLock_t * dlock) @@ -434,12 +435,12 @@ Operational Functions Waits for a write lock on the *RWLock* to become available. This will occur when all readers have released their locks by calling *dragon_rwlock_read_unlock*. Once the write lock is acquired, all readers wait until the write lock is released by calling *dragon_rwlock_write_unlock*. - + Returns DRAGON_SUCCESS, TBD. - + .. c:function:: dragonError_t dragon_rwlock_write_unlock(dragonRWLock_t * dlock) Releases the acquired write lock on the *RWLock* referred to by *dlock*. Any waiting readers or writers will be able to proceed. No guarantee is made for which will proceed first. - + Returns DRAGON_SUCCESS, TBD. 
\ No newline at end of file diff --git a/doc/conf.py b/doc/conf.py index e8794e7..286f304 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -24,8 +24,8 @@ # -- Project information ----------------------------------------------------- project = "Dragon" -DragonVersion = "0.61" -copyright = "2023, Hewlett Packard Enterprise" +DragonVersion = "0.8" +copyright = "2024, Hewlett Packard Enterprise" author = "Michael Burke, Eric Cozzi, Zach Crisler, Julius Donnert, Veena Ghorakavi, Nick Hill, Maria Kalantzi, Ben Keen, Kent D. Lee, Pete Mendygral, Davin Potts and Nick Radcliffe" # -- General configuration --------------------------------------------------- diff --git a/doc/infrastructure/architecture.rst b/doc/infrastructure/architecture.rst index 4614ccd..03211a0 100644 --- a/doc/infrastructure/architecture.rst +++ b/doc/infrastructure/architecture.rst @@ -14,12 +14,13 @@ not work in a distributed system. While very portable across time and operating descriptor based approach doesn't offer the best performance and scalability. .. figure:: images/infrastructure.svg + :name: infra-schematic - **Figure 1: Dragon Runtime Architecture in a multi-node deployment** + **Dragon Runtime Architecture in a multi-node deployment** .. The rest of this section describes this component diagram ... -There are various actors involved in the Dragon runtime, they are shown in figure 1. Although they will be +There are various actors involved in the Dragon runtime, they are shown in :numref:`infra-schematic`. Although they will be specified in more detail in other documents, we list them here and summarize their function. Whenever necessary, the user program is called ``my.py`` and is started from the command line by invoking the :ref:`Launcher` with ``dragon my.py``. In particular, no preliminary interaction with the system's workload diff --git a/doc/infrastructure/images/overlay_network_fanout.puml b/doc/infrastructure/images/overlay_network_fanout.puml new file mode 100644 index 0000000..9edd02a --- /dev/null +++ b/doc/infrastructure/images/overlay_network_fanout.puml @@ -0,0 +1,38 @@ +@startmindmap + +* Front End +** Node 0 +*** Node 32 +**** 1056 +**** ... +**** 1087 +*** ... +**** ... +*** Node 63 +**** Node 2048 +**** ... +**** Node 2079 +** ... +*** ... +**** ... +** Node 8 +*** Node 288 +**** Node 9248 +**** ... +**** Node 9279 +*** ... +**** ... +*** Node 311 +**** Node 9984 +**** ... +**** Node 9999 +*** ... +*** Node 319 +** ... +*** ... +** Node 31 +*** Node 1024 +*** ... +*** 1055 + +@endmindmap \ No newline at end of file diff --git a/doc/infrastructure/infrastructure.rst b/doc/infrastructure/infrastructure.rst index 109f949..bd8d77e 100644 --- a/doc/infrastructure/infrastructure.rst +++ b/doc/infrastructure/infrastructure.rst @@ -13,4 +13,6 @@ Infrastructure single_node_deployment.rst multi_node_deployment.rst bootstrapping.rst - logging.rst \ No newline at end of file + logging.rst + policy.rst + overlay_network.rst diff --git a/doc/infrastructure/messages_api.rst b/doc/infrastructure/messages_api.rst index f7802ac..215febe 100644 --- a/doc/infrastructure/messages_api.rst +++ b/doc/infrastructure/messages_api.rst @@ -3319,27 +3319,15 @@ value 0 in the single node case. channels there are. *fields* - **ip_addr** - - string - - the ip address of the node sending this message. - - **host_name** - - string - - the hostname of the node sending this message. - - **host_id** - - integer - - the hostid of the node sending this message. + **node_desc** + - ``Python` - + :class:`Python` .. 
_shpinggs: @@ -3889,9 +3877,9 @@ These messages go to the :ref:`Launcher` frontend in standard and server mode vi *fields* **nodes_desc** - dictionary with keys corresponding to string node indices and values of class - ``dragon.launcher.node_desc._NodeDescriptor``(eg: The hostname info for node 5 is accessed via + ``dragon.infrastructure.node_desc.NodeDescriptor`` (e.g., the hostname info for node 5 is accessed via ``la_channels_msg.nodes_desc['4'].host_name`` ) - - Attributed for each value of ``dragon.launcher.node_desc._NodeDescriptor``: + - Example attributes for each value of ``dragon.infrastructure.node_desc.NodeDescriptor``: - 'host_name': hostname string - 'host_id': integer which is determined by calling the Posix gethostid function @@ -3907,7 +3895,7 @@ These messages go to the :ref:`Launcher` frontend in standard and server mode vi - The number of gateway channels per node *implementation(s):* :func:`Python` - :class:`Python` + :class:`Python` 94. **Breakpoint** diff --git a/doc/infrastructure/multi_node_deployment.rst b/doc/infrastructure/multi_node_deployment.rst index 8d31a60..26f0246 100644 --- a/doc/infrastructure/multi_node_deployment.rst +++ b/doc/infrastructure/multi_node_deployment.rst @@ -11,17 +11,19 @@ command prompt on. This frontend can be co-located with backend nodes or run from its own node. All off-node communication that is initiated by the :ref:`LocalServices` goes through :ref:`GlobalServices`. Local Services itself has a one-node view of the world while Global Services does the -work of communicating off node when necessary. Figures 1 and 2 depict a +work of communicating off node when necessary. :numref:`deploy-multi-node` and :numref:`multi-node-overview` depict a multi-node version of the Dragon :ref:`Services`. .. figure:: images/deployment_multi_node.svg + :name: deploy-multi-node - **Figure 1: Startup Overview** + **Startup Overview** .. figure:: images/multinodeoverview.png :scale: 30% + :name: multi-node-overview - **Figure 2: Multi-Node Overview of Dragon Services** + **Multi-Node Overview of Dragon Services** .. _MultiNodeBringup: @@ -117,7 +119,7 @@ Since all :ref:`Services` run as user-level services (i.e. not with superuser authority), the services described here are assumed to be one per launched user program. -The multi-node bring-up sequence is given in figure 3 and in the section titled +The multi-node bring-up sequence is given in :numref:`startup-seq-multinode` and in the section titled :ref:`MultiNodeBringup` where the message descriptions are also provided. The Launcher Frontend brings up an instance of the Launcher Backend on each node. Each launcher (frontend and backend) then brings up an instance of the TCP @@ -132,12 +134,12 @@ output from the user program to the Frontend through the Backend. Sequence diagram ------------------- -Figure 3 depicts the message flow in the multi-node startup sequence. +The diagram below depicts the message flow in the multi-node startup sequence. .. raw:: html :file: images/startup_seq_multi_node.svg -**Figure 3: Sequence diagram of Dragon multi-node bringup** +**Sequence diagram of Dragon multi-node bringup** Notes on Bring-up Sequence ^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -338,7 +340,7 @@ Sequence diagram ..
raw:: html :file: images/teardown_seq_multi_node.svg -**Figure 7: Multi-Node Teardown Sequence** +**Multi-Node Teardown Sequence** Notes on Teardown Sequence ^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/doc/infrastructure/overlay_network.rst b/doc/infrastructure/overlay_network.rst new file mode 100644 index 0000000..b2a6962 --- /dev/null +++ b/doc/infrastructure/overlay_network.rst @@ -0,0 +1,18 @@ +.. _Overlay_Network: + +Dragon Overlay Network ++++++++++++++++++++++++ + +In the multi-node case, Dragon establishes an Overlay Network to communicate between the Dragon Launcher FrontEnd and +the Dragon Launcher Backend processes running on each backend compute node. To establish this Overlay Network, Dragon uses +the Dragon TCP Network Agent in a Fanout Tree. The Fanout Tree uses a default branching factor of 32 nodes. The hierarchy +of the tree follows the order of nodes in the node list, with the root being the front end and the children of node n being located at +b*n+1 through b*n+b, where b is the branching factor (32) of the tree. This enables Dragon to scale and communicate with a large +number of nodes efficiently. + +:numref:`overlay-network-fanout` shows an example of a 10,000 node Dragon Overlay Network Fanout. + +.. figure:: images/overlay_network_fanout.svg + :name: overlay-network-fanout + + **Example 10,000 node Overlay Network fanout** \ No newline at end of file diff --git a/doc/infrastructure/processes.rst b/doc/infrastructure/processes.rst index d347a5e..335133a 100644 --- a/doc/infrastructure/processes.rst +++ b/doc/infrastructure/processes.rst @@ -185,13 +185,14 @@ Activity Diagram Following is a flow diagram showing the interaction between components during process launch, output and input handling, signaling, and process termination. The text below the figure gives additional details on the -activities during this interaction. The *a1* through *a6* are denoted in figure 1 and the numbered list below +activities during this interaction. The *a1* through *a6* are denoted in :numref:`launchproc` and the numbered list below further describes those activities. .. figure:: images/launchproc.srms1.png :scale: 75% + :name: launchproc - **Figure 1: Launcher Component Interaction during Process Interaction** + **Launcher Component Interaction during Process Interaction** Activity Description diff --git a/doc/infrastructure/single_node_deployment.rst b/doc/infrastructure/single_node_deployment.rst index a636f03..f384b80 100644 --- a/doc/infrastructure/single_node_deployment.rst +++ b/doc/infrastructure/single_node_deployment.rst @@ -13,25 +13,27 @@ user application decides to directly spawn processes itself, it retains the resp resources they use. .. figure:: images/deployment_single_node.svg + :name: deploy-single-node - **Figure 1: Deployment diagram a single node** + **Deployment diagram of a single node** .. figure:: images/singlenodeoverview.png + :name: singlenode-overview - **Figure 2: Single-Node Overview of Dragon Services** + **Single-Node Overview of Dragon Services** .. FIXME: NOTE: In the single-node case the :ref:`Launcher` serves as both frontend and backend component. So the launcher sends and receives several backend messages during bringup and teardown. **FIXME**: Adapt UML Diagram to be correct -In the single-node case, as depicted in figures 1 + 2, there is no :ref:`TransportAgent`, :ref:`MRNet` tree, -or :ref:`Launcher` backend service. :ref:`Channels` in figure 2 are represented by the colored arrows.
The +In the single-node case, as depicted in :numref:`deploy-single-node` and :numref:`singlenode-overview`, there is no :ref:`TransportAgent`, :ref:`MRNet` tree, +or :ref:`Launcher` backend service. :ref:`Channels` in :numref:`singlenode-overview` are represented by the colored arrows. The :ref:`Launcher` steps into the place of the Backend and the Shepherd communicates directly with the Launcher instead of going through the Backend and the :ref:`MRNet` tree. The :ref:`TransportAgent` is not started since there is no off-node communication in this case. However, :ref:`LocalServices` and :ref:`GlobalServices` are still present to provide the same level of service that is present in the multi-node case. While the bringup and teardown of the Dragon :ref:`Services` is significantly different in the single-node and -multi-node cases, from figures 1 and 2 the overall structure is similar. +multi-node cases, from :numref:`deploy-single-node` and :numref:`singlenode-overview` the overall structure is similar. .. _SingleNodeBringup: @@ -39,10 +41,11 @@ Single Node Bringup =================== .. figure:: images/startup_seq_single_node.svg + :name: startup-seq-single-node - **Figure 3: Startup Sequence on a single node** + **Startup Sequence on a single node** -The bringup of the Dragon run-time services is detailed in figure 3 and below, where also message descriptions +The bringup of the Dragon run-time services is detailed in :numref:`startup-seq-single-node` and below, where message descriptions are also given. During single node bringup the Shepherd is started by the Launcher and a pipe is used to provide the initial @@ -645,7 +648,7 @@ infrastructure starting with activity 5 and message 4 in the diagram below. Transaction diagram ------------------- -Figure 4 depicts the normal single node teardown sequence and is also included +:numref:`teardown-seq-single-node` depicts the normal single node teardown sequence and is also included in :ref:`SingleNodeTeardown` where message definitions are given in more detail. The tear down is initiated by Global Services. The Shepherd shuts down as a result of the *SHTakedown* message sent from the launcher but the sequence @@ -658,8 +661,9 @@ the teardown of the Dragon Services. .. figure:: images/single_teardown.srms1.png :scale: 75% + :name: teardown-seq-single-node - **Figure 4: Single-Node Teardown Sequence** + **Single-Node Teardown Sequence** Activities ^^^^^^^^^^ diff --git a/doc/pguide/owner.rst b/doc/pguide/owner.rst index a627436..f4ac91f 100644 --- a/doc/pguide/owner.rst +++ b/doc/pguide/owner.rst @@ -39,10 +39,11 @@ Round Robin Placement .. figure:: images/roundrobin.svg :scale: 75% + :name: roundrobin - **Figure 1: UML deployment diagram of round robin placement with 4 processes on a distributed system with 3 compute nodes and one login node** + **UML deployment diagram of round robin placement with 4 processes on a distributed system with 3 compute nodes and one login node** -Figure 1 shows how processes will be placed across nodes in your allocation with the current round-robin placement +:numref:`roundrobin` shows how processes will be placed across nodes in your allocation with the current round-robin placement policy. Any two processes started consecutively will be placed on unique nodes unless there is only a single node within your Slurm allocation.
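To see this round-robin behavior in practice, here is a minimal sketch (ours, for illustration; it assumes a multi-node allocation and launching the script with `dragon`): four consecutively started processes should each report a distinct hostname.

.. code-block:: Python

   import dragon               # enables the "dragon" multiprocessing start method
   import multiprocessing as mp
   import os

   def report(idx):
       # Each process prints the node it was placed on.
       print(f"process {idx} is running on {os.uname().nodename}", flush=True)

   if __name__ == "__main__":
       mp.set_start_method("dragon")
       procs = [mp.Process(target=report, args=(i,)) for i in range(4)]
       for p in procs:
           p.start()
       for p in procs:
           p.join()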
diff --git a/doc/pguide/stack.rst b/doc/pguide/stack.rst index 7d5d946..72f120e 100644 --- a/doc/pguide/stack.rst +++ b/doc/pguide/stack.rst @@ -2,31 +2,32 @@ The API Stack +++++++++++++ Before you start programming with Dragon, you need to decide which API you want -to program to. The runtime provides a stack of interfaces abstracting +to program to. The runtime provides a stack of interfaces abstracting :term:`resources ` of a distributed system, ranging from low-level shared memory to a distributed dictionary. It is *composable*, meaning the APIs are -built on top of each other (see figure 1). We deliberately expose the whole +built on top of each other (see :numref:`dragon-api-stack`). We deliberately expose the whole stack of APIs so that you can choose if you want to interact with the complete -runtime or want to use only parts of it. For an overview, see table 1 in the +runtime or want to use only parts of it. For an overview, see table 1 in the :ref:`reference section `. .. figure:: images/dragon_api_stack.png :scale: 25% + :name: dragon-api-stack - **Figure 1: The Dragon API stack** + **The Dragon API stack** Lower level interfaces yield less convenient objects. Thus new and experienced users should consider programming to Dragon in two ways: -1. In Python using :ref:`pguide/dragon_multiprocessing:Multiprocessing with Dragon`, if they +1. In Python using :ref:`pguide/dragon_multiprocessing:Multiprocessing with Dragon`, if they want to make an existing Python code scale to a distributed system quickly. -2. In C, C++, Fortran and Python using the :ref:`ref/native/index:Dragon Native` API, if they want to take advantage +2. In C, C++, Fortran and Python using the :ref:`ref/native/index:Dragon Native` API, if they want to take advantage of all Dragon features or need to use languages other than Python. The objects provided by these two APIs have the following properties: -* **interoperable**: a named Python Multiprocessing Queue object can be used as a managed Dragon Native Queue in C with the same name. +* **interoperable**: a named Python Multiprocessing Queue object can be used as a managed Dragon Native Queue in C with the same name. * **transparent**: :term:`objects ` can be used everywhere on a system of distributed or even federated nodes. * **shareable**: objects can be serialized and passed to other programs, processes or threads via stdin. * **managed**: :term:`objects ` can be looked up by :term:`name ` or :term:`uid ` to retrieve their :term:`serialized descriptor `. @@ -39,7 +40,7 @@ programming languages. In the future, experienced developers can further program to the -3. :term:`Unmanaged ` Dragon Native API, if they want to use composite objects with improved performance. See :ref:`uguide/resource_model:Performance Costs`. +3. :term:`Unmanaged ` Dragon Native API, if they want to use composite objects with improved performance. See :ref:`uguide/resource_model:Performance Costs`. 4. Dragon Client API or Dragon Infrastructure API, if they want to extend the functionality of the Dragon runtime by extending Dragon Services. 5. Dragon Core API, to use core functionality in their own programs without starting the runtime. To use the Dragon core API on its own, see also :ref:`pguide/dragon_multiprocessing:Multiprocessing and Dragon without Patching` @@ -50,10 +51,11 @@ Architecture of a Dragon Program .. figure:: images/api_use_python.svg :scale: 75% - - **Figure 2: Architecture of a user program using Dragon with Python Multiprocessing or Dragon Native. 
Internal Dragon APIs are not shown.** + :name: api-use-python -In figure 2 we show a component diagram of the architecture of a Dragon program + **Architecture of a user program using Dragon with Python Multiprocessing or Dragon Native. Internal Dragon APIs are not shown.** + +In :numref:`api-use-python` we show a component diagram of the architecture of a Dragon program using either the Python Multiprocessing with Dragon API, or the Dragon Native API. @@ -63,7 +65,7 @@ API. Dragon services manage the primary objects and communicate using the infrastructure message component on top of the core API. -* Python Multiprocessing with Dragon programs only use the Multiprocessing API. Our +* Python Multiprocessing with Dragon programs only use the Multiprocessing API. Our MPBridge component translates the Multiprocessing objects into Dragon native objects by heavily modifying the object APIs. This way we achieve limited interoperability between both APIs. @@ -73,18 +75,19 @@ Architecture of Advanced Use Cases .. figure:: images/api_use_core.svg :scale: 75% + :name: api-use-core - **Figure 3: Architecture of advanced use cases for the Dragon runtime. Internal APIs are not shown.** + **Architecture of advanced use cases for the Dragon runtime. Internal APIs are not shown.** -In figure 3 we show a component diagram of the architecture of advanced use cases for Dragon. +In :numref:`api-use-core` we show a component diagram of the architecture of advanced use cases for Dragon. Note that these use cases are not supported yet. * User programs using :term:`unmanaged ` Dragon native objects directly call into - Dragon, but do not require the infrastructure services to track :term:`names ` and + Dragon, but do not require the infrastructure services to track :term:`names ` and :term:`uids ` of their objects. This reduces the load on infrastructure services, which only provide :term:`transparency ` across :term:`distributed or federated systems `. -* Users may choose to extend the Dragon native API with their own composite objects, using Dragons +* Users may choose to extend the Dragon native API with their own composite objects, using Dragons native, client and core APIs for maximum flexibility. * User may want to use only the Dragon core components to extend their own programs with its components. In that case the infrastructure components of Dragon do not need to be started, Dragon core components can be - :ref:`directly imported and used `. \ No newline at end of file + :ref:`directly imported and used `. \ No newline at end of file diff --git a/doc/ref/client/index.rst b/doc/ref/client/index.rst index 9f72d5d..481dac6 100644 --- a/doc/ref/client/index.rst +++ b/doc/ref/client/index.rst @@ -33,10 +33,11 @@ Architecture .. figure:: images/client_architecture.svg :scale: 75% + :name: GS-client-architecture - **Figure 1: GS Client architecture** + **GS Client architecture** -Figure 1 shows the architecture of GS Client API. It exposes four base components to the user: +:numref:`GS-client-architecture` shows the architecture of GS Client API. It exposes four base components to the user: 1. **Process**: An interface to a managed process. 2. **Pool/Shared Memory**: An interface to a managed memory pool or allocation. diff --git a/doc/ref/data/index.rst b/doc/ref/data/index.rst new file mode 100644 index 0000000..51d9196 --- /dev/null +++ b/doc/ref/data/index.rst @@ -0,0 +1,15 @@ +Data +++++++++ + +Python Reference +================ +.. currentmodule:: dragon.data.distdictionary + +.. 
autosummary:: + :toctree: + :recursive: + + dict_managers + distributed_dict + dragon_dict + diff --git a/doc/ref/inf/index.rst b/doc/ref/inf/index.rst index efc77f3..4d6c5f7 100644 --- a/doc/ref/inf/index.rst +++ b/doc/ref/inf/index.rst @@ -28,6 +28,7 @@ Python Components messages node_desc parameters + policy pool_desc process_desc standalone_conn @@ -48,10 +49,11 @@ Architecture .. figure:: images/infrastructure_architecture.svg :scale: 75% + :name: dragon-inf-api-architecture - **Figure 1: Architecture of the Dragon Infrastructure API** + **Architecture of the Dragon Infrastructure API** -Figure 1 shows a UML2 component diagram of the Dragon infrastructure API and its components. +:numref:`dragon-inf-api-architecture` shows a UML2 component diagram of the Dragon infrastructure API and its components. The infrastructure API is consumed by Dragon Services: Local Services, Global Services, Launcher Backend, and the Transport Agents. It consists mostly of conventions, like message types and common IDs. diff --git a/doc/ref/mpbridge/index.rst b/doc/ref/mpbridge/index.rst index 7b3913e..36269fb 100644 --- a/doc/ref/mpbridge/index.rst +++ b/doc/ref/mpbridge/index.rst @@ -1,4 +1,4 @@ -MPBridge ++++++++ The Dragon MPBridge component maps the Python Multiprocessing API onto :ref:`ref/native/index:Dragon Native` @@ -24,8 +24,9 @@ Components .. figure:: images/mpbridge_architecture.svg :scale: 75% + :name: mpbridge-architecture - **Figure 1: MPBridge architecture** + **MPBridge architecture** Designing the MPBridge component, we had the following goals in mind: @@ -53,24 +54,25 @@ Example: The Dragon Queue For example, the size of a queue in Dragon native is `q.size()`, while in Multiprocessing it is `q.qsize()`. We created a private method `q._size()` and have `q.size()` wrap it in Dragon Native. In MPBridge, we then remove the `q.size()` that DragonQueue has inherited from Dragon Native's queue and add `q.qsize()` in -DragonQueue that wraps the same private method. +DragonQueue that wraps the same private method. Next we show a class diagram of Dragon's queue implementation and how it is inserted into the Multiprocessing package. .. figure:: images/mpbridge_class_diagram.svg + :name: dragon-mpbridge-queue-impl - **Figure 2: Class diagram of the mpbridge.queue implementation.** + **Class diagram of the mpbridge.queue implementation.** Dragon's native queue implementation resides in ``dragon.native.queue.Queue``. Its public interface is the sum of the public interface of the three Python Multiprocessing Queues: ``Queue``, ``JoinableQueue`` and ``SimpleQueue``. The MPBridge component inherits from ``dragon.native.queue.Queue`` into ``dragon.mpbridge.queues.DragonQueue``, ``dragon.mpbridge.queues.DragonSimpleQueue`` and ``dragon.mpbridge.queues.DragonJoinableQueue``. The public API is modified accordingly, so that it conforms with the -Multiprocessing API. +Multiprocessing API. The MPBridge component also contains 3 functions (``Queue``, ``SimpleQueue`` and ``JoinableQueue``) that return the corresponding -classes. The are called from the ``DragonContext``. +classes. They are called from the ``DragonContext``. Just as in Multiprocessing, the methods below the context are exported during startup into the module API. The context itself is part of a list of contexts held at the top level, containing a context per start method. Setting the start method then means setting -the ``DefaultContext`` equal to one of these contexts.
To add our start method to this mechanism, we add an ``AugmentedDefaultContext`` +the ``DefaultContext`` equal to one of these contexts. To add our start method to this mechanism, we add an ``AugmentedDefaultContext`` that adds our start method to the list of possible start methods and overloads the ``set_start_method`` method. diff --git a/doc/ref/native/index.rst b/doc/ref/native/index.rst index bd2bae9..0657c83 100644 --- a/doc/ref/native/index.rst +++ b/doc/ref/native/index.rst @@ -37,8 +37,9 @@ Architecture .. figure:: images/architecture.svg :scale: 75% + :name: dragon-native-architecture - **Figure 1: The Dragon native architecture** + **The Dragon native architecture** Dragon native components use the Dragon Global Services Client API to implement :term:`refcounted `, :term:`managed `, or diff --git a/doc/ref/ref.rst b/doc/ref/ref.rst index d9b67ad..5f7350a 100644 --- a/doc/ref/ref.rst +++ b/doc/ref/ref.rst @@ -11,6 +11,8 @@ API Reference inf/index.rst core/index.rst mpbridge/index.rst + workflows/index.rst + data/index.rst ` message. The following fields are part @@ -151,8 +154,9 @@ all notifications about output on standard output and error, while the Global Se notification of the termination of the process. .. figure:: images/managedservices.png + :name: managedservices - **Figure 9: Managed Process services provided by Local Services** + **Managed Process services provided by Local Services** Initially the managed process is in the *init* state and an AsyncIO *process* task (see :ref:`Task Types `) is created that will run to create the process and move it to the *run* state. Once the task is @@ -186,8 +190,9 @@ The Local Services/Global Services Integration ======================================== .. figure:: images/gsmonitor.png + :name: gsmonitor - **Figure 10: The Global Services Monitor** + **The Global Services Monitor** During startup, Local Services creates :ref:`GlobalServices` like a managed process on the node designated as the *PRIMARY_INDEX* in the Dragon Runtime launch parameters (see :ref:`LaunchParameters`) from the perspective diff --git a/doc/start/start.rst b/doc/start/start.rst index 258a864..1be9820 100644 --- a/doc/start/start.rst +++ b/doc/start/start.rst @@ -15,7 +15,7 @@ Prerequisites You need to have the following software packages installed on your system: -- Python 3.9 (e.g., module load cray-python) +- Python 3.9, 3.10, or 3.11 corresponding to your whl file (e.g., module load cray-python) - GCC 9 or later - Slurm or PBS+PALS (for multi-node Dragon) @@ -30,7 +30,7 @@ Install Dragon Before you can run programs using Dragon, you must set up the run-time for your environment. You must have Python 3.9 installed and it must be in your path somewhere. A common choice is to use a Python virtual environment, which can be initialized -from a base Python 3.9+ with: +for example from a base Python 3.9+ with: .. code-block:: console @@ -44,7 +44,7 @@ are relative to the directory that contains the README.md. .. code-block:: console - pip3 install --force-reinstall dragon-0.61-cp39-cp39-linux_x86_64.whl + pip3 install --force-reinstall dragon-0.8-*.whl * Check and possibly update that `$PATH` is has the location of pip installed console scripts, such as ~/.local/bin if you're not using a virtual environment. @@ -57,7 +57,7 @@ are relative to the directory that contains the README.md. .. 
code-block:: console - module use [/path to dragon-0.61]/modulefiles + module use [/path to dragon-0.8]/modulefiles module load dragon If you intend to use Dragon on your own Linux VM or an image that you diff --git a/doc/uguide/glossary.rst b/doc/uguide/glossary.rst index d32abe5..610eb69 100644 --- a/doc/uguide/glossary.rst +++ b/doc/uguide/glossary.rst @@ -3,8 +3,9 @@ Glossary .. figure:: images/dragon_domain_model.svg :scale: 75% + :name: dragon-domain-model - **Figure 1: UML diagram of the most important Dragon concepts and their relation. Open arrows are read as "is a", diamond edges as "contains", normal arrows are annotated** + **UML diagram of the most important Dragon concepts and their relation. Open arrows are read as "is a", diamond edges as "contains", normal arrows are annotated** .. glossary:: diff --git a/doc/uguide/intro.rst b/doc/uguide/intro.rst index 346a760..ca9b71b 100644 --- a/doc/uguide/intro.rst +++ b/doc/uguide/intro.rst @@ -16,8 +16,9 @@ independently of where processes and Dragon resources are placed. .. figure:: images/overview_queue_doc.jpg :align: center :scale: 25% + :name: overview-queue-doc - **Figure 1: Dragon Object Location Transparency** + **Dragon Object Location Transparency** Dragon provides synchronization and communication :term:`objects` as well as process management to a parallel distributed program. And it manages @@ -25,7 +26,7 @@ doing this with complete :term:`transparency` to the application. While the placement of data, :term:`Dragon objects`, and processes may be controlled by the application, it is not required. And, the communication between nodes that is required to make all this work is done -automatically by the Dragon run-time services. In figure 1 above you should +automatically by the Dragon run-time services. In :numref:`overview-queue-doc` above you should notice that Process A is running on a different node than the queue it is receiving from. At the same time, both A and B are sharing a queue on-node. Process D is sending to a queue off-node. The :term:`transparency` diff --git a/doc/uguide/resource_model.rst b/doc/uguide/resource_model.rst index 870de9a..7fef41c 100644 --- a/doc/uguide/resource_model.rst +++ b/doc/uguide/resource_model.rst @@ -50,12 +50,13 @@ Object Hierarchy .. figure:: images/dragon_object_hierarchy.png :scale: 15% + :name: dragon-obj-hierarchy - **Figure 1: A representation of the Dragon object hierarchy across Dragon Native API and Client API. Not all derived objects are shown.** + **A representation of the Dragon object hierarchy across Dragon Native API and Client API. Not all derived objects are shown.** All Dragon objects are built from four :term:`primary objects ` on top of the :ref:`Dragon Client API ` that represent fundamental -:term:`resource` abstractions (see figure 1): +:term:`resource` abstractions (see :numref:`dragon-obj-hierarchy`): 1. **Process**: A POSIX process that is tracked by the Dragon run-time services. 2. **Memory pool**: A block of shared memory managed by the Dragon run-time services that programs can allocate from. diff --git a/doc/uguide/running_dragon.rst b/doc/uguide/running_dragon.rst index b72a985..af61f49 100644 --- a/doc/uguide/running_dragon.rst +++ b/doc/uguide/running_dragon.rst @@ -1,38 +1,78 @@ Running Dragon ++++++++++++++ -Launching a Dragon program is done in a similar fashion to starting a program -with a workload manager, eg: `srun`_. In Dragon's case, it is invoked via -`dragon`. 
Its help and basic usage appears below: +Dragon can be run on either a single node (such as your laptop or other single compute resource) or on a +cluster of many compute resources (multi-node system). In either case, launching a Dragon application is +done in a similar way. + +1. Ensure that the Dragon package has been installed into your Python environment. This is typically done + within a virtual Python environment. See the Dragon :doc:`installation instructions ` for + performing this step. + +2. Ensure that the Dragon module has been loaded. This adds the `dragon` command to your environment. + + .. code-block:: console + + module use [/path to dragon]/modulefiles + module load dragon + +3. Start your Dragon application using the `dragon` launcher command, passing any relevant command line + options. + + .. code-block:: console + + dragon [dragon options] [program] [program options] + +The `dragon` launcher's full command help and basic usage appear below: .. _Dragon CLI Options: .. autodocstringonly:: dragon.launcher.launch_selector.main -In the event your experiment goes awry, we provide a helper script, `dragon-cleanup`, to clean up any zombie processes and memory. -The script `dragon-cleanup` is placed in the `[dragon install dir]/bin` and added to the `$PATH` environment variable after loading the Dragon module. +In the event that Dragon exits abnormally, use the helper script `dragon-cleanup` to clean up any +zombie processes and reserved memory. The `dragon-cleanup` script is located in the +`[dragon install dir]/bin` directory and added to the `$PATH` environment variable after +loading the Dragon module. Running Dragon on a Multi-Node System ===================================== -To launch a Dragon program on several compute nodes, a Slurm job allocation obtained via `salloc`_ or `sbatch`_ is required, eg: +To run in multi-node mode, Dragon must know what resources are available for its use on the +compute backend. When using a workload manager (WLM) such as Slurm or PBS+PALS, Dragon normally +obtains the list of available backend compute resources automatically from the active WLM +allocation. However, when Dragon is used on a generic cluster without a traditional WLM, +Dragon has no way to automatically ascertain what backend compute resources are available. +In these cases Dragon can be run using a generic SSH launch. + +Dragon supports the following multi-node configurations: + +1. :ref:`Running on a cluster or supercomputer that has been configured with a Workload Manager (WLM), such as Slurm or PBS+PALS.` +2. :ref:`Running on a cluster without any Workload Manager (WLM) using generic SSH launch.` + +.. _using_a_wlm: + +Running Dragon with a Workload Manager +--------------------------------------- +To launch a Dragon program on several compute nodes, a Workload Manager job allocation +obtained via `salloc`_ or `sbatch`_ (Slurm) or `qsub` (PBS+PALS) is required, e.g.: .. code-block:: bash $ salloc --nodes=2 $ dragon p2p_lat.py --iterations 100 --lg_max_message_size 12 --dragon -If the user attempts to execute on a slurm-enabled system, but without an active allocation, an exception is raised, and the program will not execute: +In the event that Dragon is run outside of an active WLM allocation, an exception is +raised, and the program will not execute: ..
code-block:: bash + :caption: Dragon exception when no WLM allocation exists: - # No salloc allocation exists: $ dragon p2p_lat.py --iterations 100 --lg_max_message_size 12 --dragon RuntimeError: Executing in a Slurm environment, but with no job allocation. Resubmit as part of an 'salloc' or 'sbatch' execution -To override this default behavior and execute a Dragon program on the same node as your shell, the `--single-node-override` option is available. +To override this default behavior and execute a Dragon program on the same node as your shell, the `--single-node-override / -s` option is available. The Dragon service runtime assumes all nodes in an allocation are to be used unless the `--node-count` option is used. This limits the user program to executing on a smaller subset of nodes, potentially useful for execution of scaling @@ -44,126 +84,119 @@ they may do the following: $ salloc --nodes=4 $ dragon --nodes 2 p2p_lat.py --iterations 100 --lg_max_message_size 12 --dragon +.. _using_ssh_launch: +Running Dragon using generic SSH launch +--------------------------------------- -RDMA-based Transport Agent (HSTA) ---------------------------------- +To use SSH launch, the following configuration options must be provided on the `dragon` +launcher command line: Please see the :ref:`FAQ ` for more information. -TCP-based Transport Agent ------------------------- +1. Select the SSH Workload Manager + The `--wlm ssh / -w ssh` option tells the `dragon` launcher to use generic SSH launch + semantics. -The TCP-based transport agent is the default transport service used for -inter-node communication through Channels. The `--transport tcp` or `--transport -hsta` option can be passed to the launcher to explicitly set the desired -transport. In the open source implementation, the TCP transport is the only -choice (see: :ref:`FAQ ` and :ref:`Launcher options `). The TCP agent is configured to use port 7575 by default. If that port -is blocked, it can be changed with the `--port` argument to `dragon`. If not -specific, 7575 is used:, eg: +2. Select the TCP Transport Agent + The `--transport tcp / -t tcp` option tells the `dragon` launcher to use the Dragon TCP + transport agent when setting up the Dragon backend compute network. This is the default + option with the open source Dragon package. -.. code-block:: bash + +3. Provide available backend compute resources + The list of available backend compute resources can be provided to the `dragon` launcher in + one of several ways: - # Port 7575 used - $ dragon --nodes 2 p2p_lat.py --iterations 100 --lg_max_message_size 12 --dragon + * :ref:`by providing a list of backend compute resources (either explicitly on the launcher command line or via a file)` or + * :ref:`by providing a Dragon network configuration file` - # Port 7000 used - $ dragon --port 7000 --nodes 2 p2p_lat.py --iterations 100 --lg_max_message_size 12 --dragon +Note: Dragon requires that passwordless SSH is enabled for all backend compute resources. -The TCP transport agent also favors known Cray high-speed interconnect networks by default. This is accomplished via -regex specification of the network's named prefix matchin `ipogif` (Aries) or `hsn` (Slingshot): `r'^(hsn|ipogif)\d+$'`. -To change, for example, to match only `hsn` networks, the `--network-prefix` argument could be used: +.. _hostlist_hostfile: -Providing a Host List or Host File ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -
code-block:: bash - $ dragon --network-prefix hsn --nodes 2 p2p_lat.py --iterations 100 --lg_max_message_size 12 --dragon +Providing a list of hosts to the `dragon` launcher can be done either by listing them explicitly +on the `dragon` command line or by providing the `dragon` launcher the name of a newline-separated +text file containing the list of host names. +To provide the available nodes explicitly on the `dragon` command line, specify the available +backend hostnames as a comma-separated list, e.g.: `--hostlist host_1,host_2,host_3`. -**KNOWN ISSUE**: If a `--network-prefix` argument is given that doesn't actually exist, the Dragon runtime will enter -a hung state. This will be fixed in future releases. For now, a `ctrl+z` and `kill` will be necessary to recover. +.. code-block:: shell + :name: host_list + :caption: **Providing a list of hosts via the command line** -Network Configuration --------------------- + (_env) root $ dragon -w ssh -t tcp --hostlist host_1,host_2,host_3 [PROG] -We provide a command line tool as part of our runtime to query the network features available on your system. It can be -invoked as follow, and the "name" keywords indicate network names that could potentially be supplied as arguments: +To provide the available nodes via a text file, create a newline-separated text file with each +backend node's hostname on a separate line. Pass the name of the text file to the `dragon` +launcher, e.g.: `--hostfile hosts.txt`. -.. code-block:: bash +.. code-block:: shell + :name: host_file + :caption: **Providing a list of hosts via a text file** - # Executing on an Aries network with the output partially truncated. Notice use - # of srun so the compute node network is queries rather than the login-node's - $ srun -n1 python3 -m dragon-transport-ifaddrs --ip --no-loopback --up --running | jq - - [ - { - "name": "ipogif0", - "flags": [ - "IFF_LOWER_UP", - "IFF_NOARP", - "IFF_RUNNING", - "IFF_UP" - ], - "addr": { - "family": "AF_INET", - "port": 0, - "addr": "10.128.0.3" - }, - "netmask": { - "family": "AF_INET", - "port": 0, - "addr": "255.252.0.0" - } - }, - { - "name": "rsip", - "flags": [ - "IFF_LOWER_UP", - "IFF_NOARP", - "IFF_RUNNING", - "IFF_POINTOPOINT", - "IFF_UP" - ], - "addr": { - "family": "AF_INET", - "port": 0, - "addr": "172.30.48.181" - }, - "netmask": { - "family": "AF_INET", - "port": 0, - "addr": "255.255.255.255" - }, - "dstaddr": { - "family": "AF_INET", - "port": 0, - "addr": "172.30.48.181" - } - } - ] + (_env) root $ cat hosts.txt + host_1 + host_2 + host_3 + (_env) root $ dragon -w ssh -t tcp --hostfile hosts.txt [PROG] -The Dragon launcher needs to know what resources are available for its use . To obtain that information, -the Dragon launcher uses an internal network config tool that is deployed at the beginning of every launch of a Dragon job. +NOTE: You cannot use both `--hostfile` and `--hostlist` on the command line at the same time. -The launcher frontend must know what resources are available for its use on the -compute backend. To obtain that information for a given set of workload -managers, there is a network config tool in the launcher module. This tool is -exposed for general use. However, if deploying dragon on a supported workload -manager with an active job allocation, the launcher frontend will handle -obtaining the network configuration. It exists in `dragon.launcher.network_config.main`, -but can be invoked directory via `dragon-network-config`.
Its help is below: +When passing the list of available backend nodes in either of these ways, the `dragon` launcher +needs to determine basic network configuration settings for each listed node before it can launch +the Dragon user application. This is done by launching a utility application on each listed node +to report the node's IP address and other relevant information. Running this utility application slightly +delays the startup of Dragon. To prevent this delay, you can instead generate a Dragon +network-config file as explained below. -.. autodocstringonly:: dragon.launcher.network_config.main .. _network_config: -If output to file (YAML or JSON are supported), the file can be provided to the launcher frontend at launch. Formatting -of the files appears below: +Providing a Dragon Network-Config File +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -.. code-block:: bash +Dragon provides a utility application to gather and persist relevant network information +from its backend compute resources. This utility can be used to generate a persistent YAML +or JSON configuration which, when passed to the `dragon` launcher, provides all +required information about a set of backend compute nodes. + +To generate a network configuration file for a given set of backend compute nodes, run the +`dragon-network-config` tool as shown below: + +.. code-block:: shell + :name: ex_run_network_config + :caption: **Example of how to run the dragon-network-config tool** + + (_env) root $ dragon-network-config -w ssh --hostlist host1,host2,host3,host4 -j + (_env) root $ ls ssh.json + ssh.json + +Once you have a network configuration file, its name can +be passed to the `dragon` launcher to identify the available backend compute resources: + +.. code-block:: shell + :name: network_config_launch + :caption: **Passing a network configuration file on the command line** + + (_env) root $ dragon -w ssh -t tcp --network-config ssh.json [PROG] + +NOTE: Changes to the backend compute nodes' IP addresses or other relevant network +settings will invalidate the saved network config file. If this happens, please +re-run the `dragon-network-config` tool to collect updated information. + +The `dragon-network-config` help is below: - # Launching Dragon with an input network config - $ dragon --network-config pbs+pals.yaml p2p-lat.py --iterations 100 --lg_max_message_size 12 --dragon +.. autodocstringonly:: dragon.launcher.network_config.main + +Formatting of the network-config file appears below for both JSON and YAML: .. code-block:: YAML + :name: yaml_network_config + :linenos: + :caption: **Example of YAML formatted network configuration file** '0': h_uid: null @@ -188,8 +221,100 @@ of the files appears below: shep_cd: '' state: 4 +.. code-block:: JSON + :name: json_network_config + :linenos: + :caption: **Example of JSON formatted network configuration file** + + { + "0": { + "state": 4, + "h_uid": null, + "name": "nid00004", + "is_primary": true, + "ip_addrs": [ + "10.128.0.5:6565" + ], + "host_id": 18446744071562724608, + "num_cpus": 0, + "physical_mem": 0, + "shep_cd": "" + }, + "1": { + "state": 4, + "h_uid": null, + "name": "nid00005", + "is_primary": false, + "ip_addrs": [ + "10.128.0.6:6565" + ], + "host_id": 18446744071562724864, + "num_cpus": 0, + "physical_mem": 0, + "shep_cd": "" + } + } + +.. _transport_agents: + +Dragon's Transport Agents +========================= + +To facilitate cross-node communication when running in multi-node mode, Dragon provides two +different transport agents. + +..
_hsta_transport_agent: + +High Speed Transport Agent (HSTA) +--------------------------------- + +The HSTA is new in Dragon 0.4. The HSTA transport is an RDMA-based transport agent that +provides MPI-like performance for inter-node communication through Dragon Channels. There are no network ports to configure +for HSTA, but it does depend on Cray-MPICH being installed on the system. + +The HSTA transport agent is currently not available in the open source version of Dragon. For +inquiries about Dragon's high speed RDMA-based transport, please contact HPE by emailing +dragonhpc@hpe.com. + +.. _tcp_transport_agent: + +TCP-based Transport Agent +------------------------- + +As of Dragon 0.5, the TCP-based transport agent is the default transport agent +for the Dragon open source package. The TCP transport agent utilizes standard TCP +for inter-node communication through Dragon Channels. + +If you are using a version of Dragon that includes the HSTA transport agent but prefer to +use the TCP transport agent, the `--transport tcp` option can be passed to the launcher (see: +:ref:`FAQ ` and :ref:`Launcher options `). + +The TCP agent is configured to use port 7575 by default. If that port is blocked, +it can be changed with the `--port` argument to `dragon`. If not specified, +7575 is used, e.g.: + +.. code-block:: bash + + # Port 7575 used + $ dragon --nodes 2 p2p_lat.py --iterations 100 --lg_max_message_size 12 --dragon + + # Port 7000 used + $ dragon --port 7000 --nodes 2 p2p_lat.py --iterations 100 --lg_max_message_size 12 --dragon + +The TCP transport agent also favors known Cray high-speed interconnect networks by default. This is accomplished via +regex specification of the network's name prefix matching `ipogif` (Aries) or `hsn` (Slingshot): `r'^(hsn|ipogif)\d+$'`. +To change, for example, to match only `hsn` networks, the `--network-prefix` argument could be used: + +.. code-block:: bash + + $ dragon --network-prefix hsn --nodes 2 p2p_lat.py --iterations 100 --lg_max_message_size 12 --dragon + + +**KNOWN ISSUE**: If a `--network-prefix` argument is given that doesn't actually exist, the Dragon runtime will enter +a hung state. This will be fixed in future releases. For now, a `ctrl+z` and `kill` will be necessary to recover. + Dragon Logging --------------- +============== The Dragon runtime has extensive internal logging for its services. For performance reasons, this is disabled by default. However for debugging, various levels of logging can be requested via `--log-level`. The specific levels match those in `Python's logging module`_. As some examples: diff --git a/examples/dragon_ai/README.md b/examples/dragon_ai/README.md new file mode 100644 index 0000000..fc29796 --- /dev/null +++ b/examples/dragon_ai/README.md @@ -0,0 +1,38 @@ +# Dragon AI API Examples + +The purpose of the example here is to show the usage of DragonTorch. This example shows how to use a PyTorch dataset stored in a Dragon distributed dictionary. Over multiple iterations, the processes interact with the same distributed dictionary. + +The correct build of PyTorch for CUDA 11.8 is installed with this command. + +``` +pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 +``` + +## Example to demonstrate PyTorch Dataset Usage with Dragon Distributed Dictionary + +This example shows how a PyTorch dataset can use a Dragon distributed dictionary to store the data.
In principle, the distributed dictionary could be shared among other processes that might interact with the training data between training iterations. The PyTorch Dataloader is used to iterate over the MNIST data. + +We run as follows: +``` +dict_torch_dataset.py [-h] [--mnist-workers MNIST_WORKERS] + [--devices-per-node DEVICES_PER_NODE] [--no-cuda] + [--epochs N] + [--dragon-dict-managers DRAGON_DICT_MANAGERS] + [--dragon-dict-mem DRAGON_DICT_MEM] +``` + +#### Optional arguments: +``` + -h, --help show this help message and exit + --mnist-workers MNIST_WORKERS + number of mnist workers (default: 2) + --devices-per-node DEVICES_PER_NODE + number of devices per node (default: 1) + --no-cuda disables CUDA training + --epochs N number of epochs to train (default: 5) + --dragon-dict-managers DRAGON_DICT_MANAGERS + number of dragon dictionary managers per node + --dragon-dict-mem DRAGON_DICT_MEM +``` + +The PyTorch dataset in the distributed Dragon dictionary in this example works on a multi-node setup. \ No newline at end of file diff --git a/examples/dragon_ai/dict_torch_dataset.py b/examples/dragon_ai/dict_torch_dataset.py new file mode 100644 index 0000000..9719e4f --- /dev/null +++ b/examples/dragon_ai/dict_torch_dataset.py @@ -0,0 +1,269 @@ +"""This example shows how a PyTorch dataset can use a Dragon distributed dictionary to store the data. In principle, the distributed dictionary could be shared among other processes that might interact with the training data between training iterations. +""" +import dragon +import multiprocessing as mp +from dragon.globalservices.node import get_list, query +import argparse +import functools +import os +import math +import queue +import dragon.ai.torch +from dragon.ai.torch.dictdataset import DragonDataset +import torch +import torch.multiprocessing as torch_mp +from torchvision import datasets, transforms +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +from torchvision import datasets, transforms +from torch.optim.lr_scheduler import StepLR + + +def get_args(): + """Get the user provided arguments + :return args: input args from command line + :rtype args: ArgumentParser object + """ + parser = argparse.ArgumentParser(description="Multi-client MNIST test with DragonDataset") + parser.add_argument("--mnist-workers", type=int, default=2, help="number of mnist workers (default: 2)") + parser.add_argument( + "--devices-per-node", type=int, default=1, help="number of devices per node (default: 1)" + ) + parser.add_argument("--no-cuda", action="store_true", default=False, help="disables CUDA training") + parser.add_argument( + "--epochs", type=int, default=5, metavar="N", help="number of epochs to train (default: 5)" + ) + parser.add_argument( + "--dragon-dict-managers", type=int, default=2, help="number of dragon dictionary managers per node" + ) + parser.add_argument( + "--dragon-dict-mem", + type=int, + default=1024 * 1024 * 1024, + help="total memory allocated to dragon dictionary", + ) + + my_args = parser.parse_args() + return my_args + + +class DragonDictArgs(object): + """Class for managing dragon distributed dictionary arguments.""" + + def __init__(self, managers_per_node: int, n_nodes: int, total_mem: int): + self.managers_per_node = managers_per_node + self.n_nodes = n_nodes + self.total_mem = total_mem + + +def build_device_queues(num_devices_per_node: int): + """Builds a dictionary of device queues. 
+ + :param num_devices_per_node: A dictionary of multiprocessing queues that hold device numbers + :type num_devices_per_node: int + :return: A dictionary of multiprocessing queues that hold device numbers + :rtype: dict[mp.queues.Queue] + """ + node_dict = {} + node_list = get_list() + for node in node_list: + node_dict[node] = mp.Queue() + for node in node_list: + for device in range(num_devices_per_node): + node_dict[node].put(device) + return node_dict + + +def get_huid(): + """Gets huid for a worker's node. + + :return: returns h_uid + :rtype: int + """ + name = os.uname().nodename + desc = query(str(name)) + return desc.h_uid + + +def get_device(device_queue): + """Grabs an unoccupied device from the nodes unique queue if devices are available. Otherwise it returns the cpu as the available device. + + + :param device_queue: A dictionary of multiprocessing queues that hold device numbers + :type device_queue: dict[mp.queues.Queue] + :return: This processes device + :rtype: PyTorch device + """ + huid = get_huid() + try: + available_cuda_device = device_queue[huid].get(timeout=10) + gpu_available = True + except queue.Empty: + gpu_available = False + + if torch.cuda.is_available() and gpu_available: + device = torch.device("cuda", available_cuda_device) + else: + # if we don't have a device that is free, we use the cpu + device = torch.device("cpu") + return device + + +class Net(nn.Module): + """Convolutional neural network (two convolutional layers and two fully connected layers) + based on the PyTorch neural network module. The RelU activation function adds nonlinearity + and the max pool reduces the noise. The dropouts help reduce overfitting. + """ + + def __init__(self): + super(Net, self).__init__() + self.conv1 = nn.Conv2d(1, 32, 3, 1) + self.conv2 = nn.Conv2d(32, 64, 3, 1) + self.dropout1 = nn.Dropout(0.25) + self.dropout2 = nn.Dropout(0.5) + self.fc1 = nn.Linear(9216, 128) + self.fc2 = nn.Linear(128, 10) + + def forward(self, x): + """Defines the computation done for a forward pass + + :param x: Input grayscaled image passed to the network + :type x: torch.Tensor + :return: Prediction + :rtype: torch.Tensor + """ + x = self.conv1(x) + x = F.relu(x) + x = self.conv2(x) + x = F.relu(x) + x = F.max_pool2d(x, 2) + x = self.dropout1(x) + x = torch.flatten(x, 1) + x = self.fc1(x) + x = F.relu(x) + x = self.dropout2(x) + x = self.fc2(x) + output = F.log_softmax(x, dim=1) + return output + + +def train(model, device, train_loader, optimizer, rank, epoch): + """Trains the model on the specified device. 
+    """Trains the model on the specified device. It utilizes a PyTorch
+    DataLoader to iterate over the data.
+
+    :param model: Neural network model that defines layers and data flow
+    :type model: mnist.Net
+    :param device: PyTorch device to use for training
+    :type device: torch.device
+    :param train_loader: PyTorch data loader for training dataset
+    :type train_loader: torch.utils.data.dataloader.DataLoader
+    :param optimizer: PyTorch optimizer used to update model parameters
+    :type optimizer: torch.optim
+    :param rank: Global rank of this process
+    :type rank: int
+    :param epoch: Current epoch
+    :type epoch: int
+    """
+
+    model.train()
+    for batch_idx, (data, target) in enumerate(train_loader):
+        data, target = data.to(device), target.to(device)
+        optimizer.zero_grad()
+        output = model(data)
+        loss = F.nll_loss(output, target)
+        loss.backward()
+        optimizer.step()
+        if batch_idx % 100 == 0:
+            print(
+                "Rank {} Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
+                    rank,
+                    epoch,
+                    batch_idx * len(data),
+                    len(train_loader.dataset),
+                    100.0 * batch_idx / len(train_loader),
+                    loss.item(),
+                ),
+                flush=True,
+            )
+
+
+def mnist_lr_sweep(args, device_queue, dataset_train, lr_p_list, global_rank):
+    """This trains on the MNIST dataset with a worker-specific learning rate,
+    printing the training loss as it progresses. Each process gets a GPU
+    that is determined by first getting a unique node identifier and
+    then using that to access the queue of available devices.
+
+    :param args: input args from command line
+    :type args: argparse.Namespace
+    :param device_queue: a dictionary of multiprocessing queues that hold device numbers
+    :type device_queue: dict[mp.queues.Queue]
+    :param dataset_train: the training dataset
+    :type dataset_train: PyTorch Dataset
+    :param lr_p_list: list of learning rates
+    :type lr_p_list: list of floats
+    :param global_rank: Global rank of this process
+    :type global_rank: int
+    """
+    torch_mp.set_start_method("dragon", force=True)
+    use_cuda = not args.no_cuda and torch.cuda.is_available()
+    lr_p = lr_p_list[global_rank]
+    # grab an unoccupied device from this node's queue
+    device = get_device(device_queue)
+    seed = math.floor(4099 * lr_p)
+    torch.manual_seed(seed)
+
+    train_kwargs = {"batch_size": 64}
+    if use_cuda:
+        cuda_kwargs = {
+            "num_workers": 4,
+            "pin_memory": True,
+            "shuffle": True,
+            "multiprocessing_context": "dragon",
+            "persistent_workers": True,
+        }
+        train_kwargs.update(cuda_kwargs)
+
+    train_loader = torch.utils.data.DataLoader(dataset_train, **train_kwargs)
+
+    model = Net().to(device)
+    optimizer = optim.Adadelta(model.parameters(), lr=lr_p)
+    scheduler = StepLR(optimizer, step_size=1, gamma=0.7)
+
+    for epoch in range(1, args.epochs + 1):
+        train(model, device, train_loader, optimizer, global_rank, epoch)
+        scheduler.step()
+
+
+if __name__ == "__main__":
+    args = get_args()
+    mp.set_start_method("dragon")
+
+    # get the list of nodes from Global Services
+    nodeslist = get_list()
+    nnodes = len(nodeslist)
+
+    num_mnist_workers = args.mnist_workers
+    assert num_mnist_workers > 1
+    print(f"Number of nodes: {nnodes}", flush=True)
+    print(f"Number of MNIST workers: {num_mnist_workers}", flush=True)
+    print(f"Number of dragon dict managers: {args.dragon_dict_managers*nnodes}", flush=True)
+
+    transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
+    dataset_base_path = "./torch-data-dict/data"
+    dataset = datasets.MNIST(dataset_base_path, train=True, download=True, transform=transform)
+
+    dragon_dict_args = DragonDictArgs(args.dragon_dict_managers, nnodes, args.dragon_dict_mem)
+    dragon_dataset = DragonDataset(dataset, dragon_dict_args=dragon_dict_args)
+
+    device_queue = build_device_queues(args.devices_per_node)
+    lr_list = [1 / (num_mnist_workers - 1) * i + 0.5 for i in range(num_mnist_workers)]
+    mnist_lr_sweep_partial = functools.partial(mnist_lr_sweep, args, device_queue, dragon_dataset, lr_list)
+    mnist_pool = mp.Pool(num_mnist_workers)
+
+    # launch the mnist training jobs
+    results = mnist_pool.map(mnist_lr_sweep_partial, [idx for idx in range(num_mnist_workers)], 1)
+
+    mnist_pool.close()
+    mnist_pool.join()
diff --git a/examples/jupyter/doc_ref/basic_pandarallel_demo.py b/examples/jupyter/doc_ref/basic_pandarallel_demo.py
new file mode 100644
index 0000000..1f5b869
--- /dev/null
+++ b/examples/jupyter/doc_ref/basic_pandarallel_demo.py
@@ -0,0 +1,101 @@
+# ---
+# jupyter:
+#   jupytext:
+#     text_representation:
+#       extension: .py
+#       format_name: light
+#       format_version: '1.5'
+#     jupytext_version: 1.15.2
+#   kernelspec:
+#     display_name: Python 3 (ipykernel)
+#     language: python
+#     name: python3
+# ---
+
+# # `pandarallel` with a Controlled Number of Progress Bars
+#
+# Up until at least version 1.6.4 of `pandarallel`, it displayed one progress bar per worker process. With a sufficiently large number of workers, this becomes overwhelming.
+#
+# This notebook demos a modification to `pandarallel` which exposes control over how many progress bars should be displayed and maps each worker process to exactly one of those progress bars. In a multi-node `dragon` execution configuration (which is _not_ demonstrated here), some nodes may be slower or faster than others, and it may be helpful to see the relative progress/speed of some of the cluster's nodes versus others -- this motivates showing more than just a single progress bar representing all workers.
+
+# !pip install pandarallel
+
+# +
+import dragon
+import multiprocessing
+
+import cloudpickle
+
+import numpy as np
+import pandas as pd
+import time
+
+import pandarallel; pandarallel.__version__
+# -
+
+multiprocessing.set_start_method("dragon")
+pandarallel.core.dill = cloudpickle
+ctx = multiprocessing.get_context("dragon")
+# minimal Manager stand-in exposing only Queue, which is all pandarallel needs here
+ctx.Manager = type("PMgr", (), {"Queue": ctx.Queue})
+pandarallel.core.CONTEXT = ctx
+pandarallel.pandarallel.initialize(progress_bar=True)
+
+# +
+num_rows = 10
+
+df = pd.DataFrame(
+    {
+        "seqnum": np.arange(42, (42 + num_rows), dtype=int),
+        #"metric_A": np.random.rand(num_rows),
+        #"metric_B": np.random.rand(num_rows),
+        "metric_C": np.random.rand(num_rows),
+        "alt_seq": np.random.randint(low=42, high=(42 + num_rows), size=(num_rows,)),
+        "label": np.array(list("ATCG"))[np.random.randint(0, 4, num_rows)],
+    },
+)
+# -
+
+df.head()
+
+# The use of a global variable inside a lambda function demonstrates key functionality from `cloudpickle` that is not otherwise available through `dill`.
+
+cutoff = 0.3
+
+# Running this next cell will cause as many progress bars to be displayed as there are workers (potentially a lot).
+
+start = time.monotonic()
+df['highlow_C'] = df['metric_C'].parallel_apply(lambda x: x < cutoff)
+stop = time.monotonic()
+tot_time = stop - start
+time_dict = {}
+time_dict["1"] = tot_time
+
+# Now we have our new column of values in our `pandas.DataFrame`.
+
+df.head()
+
+# We can change our minds about how many progress bars to display, at will.
+
+pandarallel.pandarallel.initialize(progress_bar=10)  # Will display a total of 10 progress bars.
+
+start = time.monotonic()
+df['highlow_C'] = df['metric_C'].parallel_apply(lambda x: x < cutoff)
+stop = time.monotonic()
+tot_time = stop - start
+time_dict["2"] = tot_time
+
+# There will be plenty of use cases / scenarios where a single progress bar is all we want.
+
+pandarallel.pandarallel.initialize(progress_bar=1)  # Will display 1 progress bar representing all workers.
+
+start = time.monotonic()
+df['highlow_C'] = df['metric_C'].parallel_apply(lambda x: x < cutoff)
+stop = time.monotonic()
+tot_time = stop - start
+time_dict["3"] = tot_time
+
+print("parallel_apply", "\t", "Time (seconds)")
+for key, value in time_dict.items():
+    print("{:<20} {:<20}".format(key, value))
+
+# Though it is very minor compared to the overall wall time, reducing the number of progress bars displayed can shave off a small amount of execution time.
diff --git a/examples/jupyter/doc_ref/bioinformatics_alignment_pandarallel_demo.py b/examples/jupyter/doc_ref/bioinformatics_alignment_pandarallel_demo.py
new file mode 100644
index 0000000..fb75ffc
--- /dev/null
+++ b/examples/jupyter/doc_ref/bioinformatics_alignment_pandarallel_demo.py
@@ -0,0 +1,202 @@
+# ---
+# jupyter:
+#   jupytext:
+#     text_representation:
+#       extension: .py
+#       format_name: light
+#       format_version: '1.5'
+#     jupytext_version: 1.15.2
+#   kernelspec:
+#     display_name: Python 3 (ipykernel)
+#     language: python
+#     name: python3
+# ---
+
+# # Bioinformatics Alignment `pandarallel` Examples with a Controlled Number of Progress Bars
+#
+# In the bioinformatics community, the `pandas` `DataFrame` is a popular tool for holding, manipulating, and performing computations on genomic sequence data and their associated properties. For larger datasets, the need to parallelize operations on DataFrames motivates tools such as `pandarallel`. Because `pandarallel` is implemented against the standard Python `multiprocessing` library, this is an opportunity for `dragon` to accelerate users' code and enable greater scalability without necessarily requiring users to modify their code or their patterns of thinking around it.
+#
+# This example uses a variant of NPSR1 linked to moderate/severe (stage III/IV) endometriosis, asthma, and sleep-related disorders. This variant is queried against a small dataset of nucleotide and protein sequences for the closest match. The closeness of the match is determined by the pairwise alignment score, the E-value, and the percentage of match coverage.
+#
+# In a multi-node `dragon` execution configuration (which is _not_ demonstrated here), some nodes may be slower or faster than others, and it may be helpful to see the relative progress/speed of some of the cluster's nodes versus others -- this motivates showing more than just a single progress bar representing all workers.
+#
+# The use case illustrates how `parallel_apply` from `pandarallel` drives feature engineering for a k-means clustering task. The features are commonly utilized in bioinformatics.
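+
+# For orientation before the installs and imports, the Dragon wiring this notebook relies on
+# boils down to a few lines (repeated verbatim in the setup cell further below; shown here
+# only as a reference sketch):
+#
+#     multiprocessing.set_start_method("dragon")
+#     pandarallel.core.dill = cloudpickle
+#     ctx = multiprocessing.get_context("dragon")
+#     pandarallel.core.CONTEXT = ctx
+#     pandarallel.pandarallel.initialize(progress_bar=True)
+#
+# Swapping `pandarallel.core.dill` for `cloudpickle` is what lets the lambdas below, which
+# capture global variables, serialize correctly under `dragon` multiprocessing.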
+
+# !pip install pandarallel
+# !pip install biopython
+# !pip install pyalign
+# !pip install scikit-learn
+# !pip install matplotlib
+# !pip install seaborn
+
+# +
+import dragon
+import multiprocessing
+
+import cloudpickle
+
+import os
+os.environ['OPENBLAS_NUM_THREADS'] = '1'
+
+import numpy as np
+import pandas as pd
+
+import Bio
+from Bio import SeqIO, Entrez
+import pyalign
+import time
+import matplotlib.pyplot as plt
+from sklearn.cluster import KMeans
+import seaborn as sns
+import os.path
+import pandarallel; pandarallel.__version__
+# -
+
+# The NPSR1 variant, rs142885915, is linked to stage III/IV endometriosis. The GenBank record is linked here: https://www.ncbi.nlm.nih.gov/nuccore/KR711722
+
+Entrez.email = None  # set your email address here before running
+if Entrez.email is None:
+    raise Exception("need to set your email address here")
+handle = Entrez.efetch(db="nuccore", id="KR711722", rettype="gb", retmode="text")
+read = SeqIO.read(handle, "genbank")
+handle.close()
+
+# A subset of the nucleotide (DNA + mRNA) database is created that is specific to NPSR1. The database is written to a csv file. If the database already exists, it is read from the file.
+
+endo_name, endo_nucl_seq, endo_descr = str(read.name), str(read.seq), str(read.description)
+if os.path.isfile("NSPR1_genes_database.csv"):
+    nucl_df = pd.read_csv("NSPR1_genes_database.csv", on_bad_lines='skip')
+else:
+    handle = Entrez.esearch(db="nucleotide", term="npsr1[gene] AND mammals[ORGN]", retmax='100')
+    record = Entrez.read(handle)
+    nucl_identifiers = list(record["IdList"])
+    handle.close()
+    nucl_id_names, nucl_sequences, nucl_descriptions = [endo_name], [endo_nucl_seq], [endo_descr]
+    for idx, seq_id in enumerate(nucl_identifiers):
+        try:
+            handle = Entrez.efetch(db="nucleotide", id=seq_id, retmode='text', rettype='gb')
+            read = SeqIO.read(handle, "genbank")
+            nucl_id_names.append(str(read.name))
+            nucl_sequences.append(str(read.seq))
+            nucl_descriptions.append(str(read.description))
+            handle.close()
+        except Exception:
+            # skip records that fail to fetch or parse
+            pass
+    nucl_df = pd.DataFrame(list(zip(nucl_id_names, nucl_sequences, nucl_descriptions)), columns=['ID Name', 'Sequence', 'Description'])
+    nucl_df.to_csv("NSPR1_genes_database.csv", index=False)
+
+nucl_df
+
+# The use of a global variable inside a lambda function demonstrates key functionality from `cloudpickle` that is not otherwise available through `dill`. There is one progress bar for each worker.
+
+multiprocessing.set_start_method("dragon")
+pandarallel.core.dill = cloudpickle
+ctx = multiprocessing.get_context("dragon")
+# minimal Manager stand-in exposing only Queue, which is all pandarallel needs here
+ctx.Manager = type("PMgr", (), {"Queue": ctx.Queue})
+pandarallel.core.CONTEXT = ctx
+pandarallel.pandarallel.initialize(progress_bar=True)
+
+
+# The pairwise alignment algorithm from PyAlign can be used with either nucleotide or amino acid sequences to find similar regions in two sequences. The pairwise alignment score can point to similar functions, evolutionary origins, and structural elements in the two sequences. The higher the score, the better the alignment.
+
+def alignment_algorithm(sequence_1, sequence_2, gap):
+    alignment = pyalign.global_alignment(sequence_1, sequence_2, gap_cost=gap, eq=1, ne=-1)
+    return alignment.score
+
+
+# Running this next cell will cause as many progress bars to be displayed as there are workers.
+
+start = time.monotonic()
+nucl_df['PyAlign Alignment Score'] = nucl_df['Sequence'].parallel_apply(lambda seq2: alignment_algorithm(endo_nucl_seq, seq2, gap=0))
+stop = time.monotonic()
+functions, bar_num, tot_time = ['PyAlign Alignment Score'], [128], [stop - start]
+
+# Now we have our new column of values in our `pandas.DataFrame` that shows the pairwise alignment score from PyAlign.
+
+nucl_df.sort_values(by=['PyAlign Alignment Score'], inplace=True, ascending=False)
+nucl_df = nucl_df[['ID Name', 'Sequence', 'PyAlign Alignment Score', 'Description']]
+nucl_df.head()
+
+# We can change our minds about how many progress bars to display, at will.
+# The next feature is the E-value, or expect value.
+# The E-value is used to determine the number of hits one can expect to see when searching the database. As the alignment score increases, the E-value decreases, which means there is a reduction in noise.
+# The smaller the E-value, the better the match. Here the E-value is approximated using the Jaccard distance.
+
+pandarallel.pandarallel.initialize(progress_bar=10)
+
+
+def jaccard_similarity(list1, list2):
+    # note: despite the name, this returns the Jaccard distance (1 - Jaccard similarity)
+    intersection = len(list(set(list1).intersection(list2)))
+    union = (len(set(list1)) + len(set(list2))) - intersection
+    return 1.0 - float(intersection) / union
+
+
+start = time.monotonic()
+nucl_df['E Value'] = nucl_df['Sequence'].parallel_apply(lambda seq2: jaccard_similarity(list(endo_nucl_seq), list(seq2)))
+stop = time.monotonic()
+functions.append("E Value")
+bar_num.append(10)
+tot_time.append(stop - start)
+
+nucl_df.sort_values(by=['E Value'], inplace=True, ascending=True)
+nucl_df = nucl_df[['ID Name', 'Sequence', 'PyAlign Alignment Score', 'E Value', 'Description']]
+nucl_df.head()
+
+# There will be plenty of use cases / scenarios where a single progress bar is all we want. For this one, we compute the sequence coverage percentage, i.e. how much of the query sequence the alignment covers.
+
+pandarallel.pandarallel.initialize(progress_bar=1)  # Will display 1 progress bar representing all workers.
+
+start = time.monotonic()
+nucl_df['Percentage Coverage'] = nucl_df['PyAlign Alignment Score'].parallel_apply(lambda match: 100 * (float(match / len(endo_nucl_seq))))
+stop = time.monotonic()
+functions.append("Percentage Coverage")
+bar_num.append(1)
+tot_time.append(stop - start)
+
+# The final nucleotide dataframe output shows the alignment score, E-value, and percentage coverage, ordered by percentage coverage. The best matches line up with the query sequence.
+
+nucl_df.sort_values(by=['Percentage Coverage'], inplace=True, ascending=False)
+nucl_df = nucl_df[['ID Name', 'Sequence', 'PyAlign Alignment Score', 'E Value', 'Percentage Coverage', 'Description']]
+nucl_df
+
+nucl_df.dtypes
+
+# Though it is very minor compared to the overall wall time, reducing the number of progress bars displayed can shave off a small amount of execution time. The time for each pandarallel `parallel_apply` call is displayed in the pandas dataframe below.
+
+time_df = pd.DataFrame(list(zip(functions, bar_num, tot_time)), columns=['Pandarallel Function', 'Number of Bars', 'Time'])
+time_sum = time_df['Time'].sum()
+time_df.loc[len(time_df.index)] = ['Total Time for Dragon Multiprocessing Pandarallel Processes (Nucleotides)', "N/A", time_sum]
+time_df
+
+# The correlations between the variables in the nucleotide pandas dataframe are plotted, and the variables for k-means clustering are identified.
+
+sns.PairGrid(nucl_df).map(sns.scatterplot);
+
+# The scatterplot function from the seaborn library is used to visualize the variables identified for k-means clustering: the x-axis is the PyAlign Alignment Score, the y-axis is the Percentage Coverage, and the hue encodes the E Value.
+
+sns.scatterplot(data=nucl_df[['PyAlign Alignment Score', 'E Value', 'Percentage Coverage']], x='PyAlign Alignment Score', y='Percentage Coverage', hue='E Value')
+
+# The cluster number is determined using the elbow method, with the default arguments for the k-means algorithm.
+
+# +
+X = np.array(nucl_df.loc[:, ['PyAlign Alignment Score', 'Percentage Coverage']])
+
+euclidean = []
+for i in range(1, 10):
+    model = KMeans(n_clusters=i)
+    model.fit(X)
+    euclidean.append(model.inertia_)
+
+plt.plot(range(1, 10), euclidean)
+plt.xlabel('Cluster number')
+plt.ylabel('Euclidean Sum of Squares')
+plt.show()
+# -
+
+# The k-means clusters are plotted, using the default arguments for the k-means algorithm.
+
+model = KMeans(n_clusters=3).fit(X)
+plt.scatter(X[:, 0], X[:, 1], c=model.labels_.astype(float))
diff --git a/examples/jupyter/doc_ref/bioinformatics_alignment_pandarallel_multinode_demo.py b/examples/jupyter/doc_ref/bioinformatics_alignment_pandarallel_multinode_demo.py
new file mode 100644
index 0000000..3d6c34c
--- /dev/null
+++ b/examples/jupyter/doc_ref/bioinformatics_alignment_pandarallel_multinode_demo.py
@@ -0,0 +1,182 @@
+# ---
+# jupyter:
+#   jupytext:
+#     text_representation:
+#       extension: .py
+#       format_name: light
+#       format_version: '1.5'
+#     jupytext_version: 1.15.2
+#   kernelspec:
+#     display_name: Python 3 (ipykernel)
+#     language: python
+#     name: python3
+# ---
+
+# # Multi-Node Bioinformatics Alignment `pandarallel` Examples with a Controlled Number of Progress Bars
+#
+# In the bioinformatics community, the `pandas` `DataFrame` is a popular tool for holding, manipulating, and performing computations on genomic sequence data and their associated properties. For larger datasets, the need to parallelize operations on DataFrames motivates tools such as `pandarallel`. Because `pandarallel` is implemented against the standard Python `multiprocessing` library, this is an opportunity for `dragon` to accelerate users' code and enable greater scalability without necessarily requiring users to modify their code or their patterns of thinking around it.
+#
+# This example uses a variant of NPSR1 linked to moderate/severe (stage III/IV) endometriosis, asthma, and sleep-related disorders. This variant is queried against a small dataset of nucleotide and protein sequences for the closest match. The closeness of the match is determined by the pairwise alignment score, the E-value, and the percentage of match coverage.
+#
+# In a multi-node `dragon` execution configuration, some nodes may be slower or faster than others, and it may be helpful to see the relative progress/speed of some of the cluster's nodes versus others -- this motivates showing more than just a single progress bar representing all workers.
+#
+# The use case illustrates how `parallel_apply` from `pandarallel` drives feature engineering for a k-means clustering task. The features are commonly utilized in bioinformatics.
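+
+# A note on launching: to actually exercise the multi-node path, the `.py` form of this
+# notebook is run under the Dragon launcher from a multi-node allocation, e.g. (an
+# illustrative invocation; exact flags depend on your system and workload manager):
+#
+#     dragon bioinformatics_alignment_pandarallel_multinode_demo.py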
+
+# !pip install pandarallel
+# !pip install biopython
+# !pip install pyalign
+# !pip install scikit-learn
+# !pip install matplotlib
+# !pip install seaborn
+
+# +
+import dragon
+import multiprocessing
+
+import cloudpickle
+
+import os
+os.environ['OPENBLAS_NUM_THREADS'] = '1'
+
+import numpy as np
+import pandas as pd
+
+import Bio
+from Bio import SeqIO, Entrez
+import pyalign
+import time
+import matplotlib.pyplot as plt
+from sklearn.cluster import KMeans
+import seaborn as sns
+import os.path
+import pandarallel; pandarallel.__version__
+# -
+
+# The NPSR1 variant, rs142885915, is linked to stage III/IV endometriosis. The GenBank record is linked here: https://www.ncbi.nlm.nih.gov/protein/AKI72104.1
+
+Entrez.email = None  # set your email address here before running
+if Entrez.email is None:
+    raise Exception("need to set your email address here")
+handle = Entrez.efetch(db="protein", id="AKI72104.1", rettype="gb", retmode="text")
+read = SeqIO.read(handle, "genbank")
+handle.close()
+endo_name, endo_transl, endo_descr = str(read.name), str(read.seq), str(read.description)
+
+# A subset of the protein database is created that is specific to NPSR1. The database is written to a csv file. If the database already exists, it is read from the file.
+
+if os.path.isfile("NSPR1_proteins_database.csv"):
+    aa_df = pd.read_csv("NSPR1_proteins_database.csv", on_bad_lines='skip')
+else:
+    handle = Entrez.esearch(db="protein", term="npsr1[gene] AND mammals[ORGN]", retmax='100')
+    record = Entrez.read(handle)
+    aa_identifiers = list(record["IdList"])
+    handle.close()
+    aa_id_names, aa_sequences, aa_descriptions = [endo_name], [endo_transl], [endo_descr]
+    for idx, seq_id in enumerate(aa_identifiers):
+        try:
+            handle = Entrez.efetch(db="protein", id=seq_id, retmode='text', rettype='gb')
+            read = SeqIO.read(handle, "genbank")
+            aa_id_names.append(str(read.name))
+            aa_sequences.append(str(read.seq))
+            aa_descriptions.append(str(read.description))
+            handle.close()
+        except Exception:
+            # skip records that fail to fetch or parse
+            pass
+    aa_df = pd.DataFrame(list(zip(aa_id_names, aa_sequences, aa_descriptions)), columns=['ID Name', 'Sequence', 'Description'])
+    aa_df.to_csv("NSPR1_proteins_database.csv", index=False)
+
+aa_df
+
+# The use of a global variable inside a lambda function demonstrates key functionality from `cloudpickle` that is not otherwise available through `dill`. There is one progress bar for each worker.
+
+multiprocessing.set_start_method("dragon")
+pandarallel.core.dill = cloudpickle
+ctx = multiprocessing.get_context("dragon")
+# minimal Manager stand-in exposing only Queue, which is all pandarallel needs here
+ctx.Manager = type("PMgr", (), {"Queue": ctx.Queue})
+pandarallel.core.CONTEXT = ctx
+pandarallel.pandarallel.initialize(progress_bar=True)
+
+
+# The pairwise alignment algorithm from PyAlign can be used with either nucleotide or amino acid sequences to find similar regions in two sequences. The pairwise alignment score can point to similar functions, evolutionary origins, and structural elements in the two sequences. The higher the score, the better the alignment.
+
+def alignment_algorithm(sequence_1, sequence_2, gap):
+    alignment = pyalign.global_alignment(sequence_1, sequence_2, gap_cost=gap, eq=1, ne=-1)
+    return alignment.score
+
+
+# The E-value is used to determine the number of hits one can expect to see when searching the database. As the alignment score increases, the E-value decreases, which means there is a reduction in noise. The smaller the E-value, the better the match. Here the E-value is approximated using the Jaccard distance.
+
+def jaccard_similarity(list1, list2):
+    # note: despite the name, this returns the Jaccard distance (1 - Jaccard similarity)
+    intersection = len(list(set(list1).intersection(list2)))
+    union = (len(set(list1)) + len(set(list2))) - intersection
+    return 1.0 - float(intersection) / union
+
+
+# The new column of values in our pandas.DataFrame shows the pairwise alignment score from PyAlign.
+
+start = time.monotonic()
+aa_df['PyAlign Alignment Score'] = aa_df['Sequence'].parallel_apply(lambda seq2: alignment_algorithm(endo_transl, seq2, gap=0))
+stop = time.monotonic()
+aa_functions, aa_bar_num, aa_tot_time = ['PyAlign Alignment Score'], [10], [stop - start]
+aa_df.sort_values(by=['PyAlign Alignment Score'], inplace=True, ascending=False)
+aa_df = aa_df[['ID Name', 'Sequence', 'PyAlign Alignment Score', 'Description']]
+aa_df.head()
+
+# The new column of values in our pandas.DataFrame shows the E-value for the sequences.
+
+start = time.monotonic()
+aa_df['E Value'] = aa_df['Sequence'].parallel_apply(lambda seq2: jaccard_similarity(list(endo_transl), list(seq2)))
+stop = time.monotonic()
+aa_functions.append('E Value')
+aa_bar_num.append(10)
+aa_tot_time.append(stop - start)
+aa_df.sort_values(by=['E Value'], inplace=True, ascending=True)
+aa_df = aa_df[['ID Name', 'Sequence', 'PyAlign Alignment Score', 'E Value', 'Description']]
+aa_df.head()
+
+# For this new column in the pandas dataframe created from `parallel_apply`, we use the sequence coverage percentage, i.e. how much of the query sequence the alignment covers. The final protein dataframe output shows the alignment score, E-value, and percentage coverage, ordered by percentage coverage. The best matches line up with the query sequence.
+
+start = time.monotonic()
+aa_df['Percentage Coverage'] = aa_df['PyAlign Alignment Score'].parallel_apply(lambda match: 100 * (float(match / len(endo_transl))))
+stop = time.monotonic()
+aa_functions.append('Percentage Coverage')
+aa_bar_num.append(10)
+aa_tot_time.append(stop - start)
+aa_df.sort_values(by=['Percentage Coverage'], inplace=True, ascending=False)
+aa_df = aa_df[['ID Name', 'Sequence', 'PyAlign Alignment Score', 'E Value', 'Percentage Coverage', 'Description']]
+aa_df
+
+# The time for each pandarallel `parallel_apply` call is displayed in the pandas dataframe below.
+
+std_time_df = pd.DataFrame(list(zip(aa_functions, aa_bar_num, aa_tot_time)), columns=['Pandarallel Function', 'Number of Bars', 'Time'])
+std_time_sum = std_time_df['Time'].sum()
+std_time_df.loc[len(std_time_df.index)] = ['Total Time for All Dragon Multiprocessing Pandarallel Processes (Amino Acids)', "N/A", std_time_sum]
+std_time_df
+
+# The correlations between the variables in the protein pandas dataframe are plotted, and the variables for k-means clustering are identified.
+
+sns.PairGrid(aa_df).map(sns.scatterplot);
+
+# The scatterplot function from the seaborn library is used to visualize the variables identified for k-means clustering: the x-axis is the PyAlign Alignment Score, the y-axis is the Percentage Coverage, and the hue encodes the E Value.
+
+sns.scatterplot(data=aa_df[['PyAlign Alignment Score', 'E Value', 'Percentage Coverage']], x='PyAlign Alignment Score', y='Percentage Coverage', hue='E Value')
+
+# The cluster number is determined using the elbow method, with the default arguments for the k-means algorithm.
+
+# +
+X = np.array(aa_df.loc[:, ['PyAlign Alignment Score', 'Percentage Coverage']])
+
+euclidean = []
+for i in range(1, 10):
+    model = KMeans(n_clusters=i)
+    model.fit(X)
+    euclidean.append(model.inertia_)
+
+plt.plot(range(1, 10), euclidean)
+plt.xlabel('Cluster number')
+plt.ylabel('Euclidean Sum of Squares')
+plt.show()
+# -
+
+# The k-means clusters are plotted, using the default arguments for the k-means algorithm.
+
+model = KMeans(n_clusters=3).fit(X)
+plt.scatter(X[:, 0], X[:, 1], c=model.labels_.astype(float))
diff --git a/examples/multiprocessing/joblib/bench_auto_batching.py b/examples/multiprocessing/joblib/bench_auto_batching.py
new file mode 100644
index 0000000..24c282a
--- /dev/null
+++ b/examples/multiprocessing/joblib/bench_auto_batching.py
@@ -0,0 +1,101 @@
+"""
+Joblib benchmark written by Olivier Grisel, with the Dragon imports added.
+
+Benchmark batching="auto" on a high number of fast tasks.
+
+The goal of this script is to study the behavior of batch_size='auto',
+and in particular the impact of the default value of the
+joblib.parallel.MIN_IDEAL_BATCH_DURATION constant.
+
+A no-op sleep function emulates real computation. It induces overhead by
+accepting (and ignoring) an arbitrary amount of input data and by allocating
+a requested amount of output data, both sized in bytes.
+
+The first pair of benchmarks checks that the auto-batching strategy is stable
+(it does not change the batch size too often) in the presence of large variance,
+while remaining comparable to the equivalent load without variance.
+
+In the second pair of benchmarks, one load has a cyclic task-duration pattern
+that the auto-batching feature should be able to roughly track. The shuffled
+variant should not oscillate too much and should have approximately the same
+total run time.
+"""
+
+import dragon
+import multiprocessing as mp
+import numpy as np
+import time
+import tempfile
+from pprint import pprint
+from joblib import Parallel, delayed
+from joblib._parallel_backends import AutoBatchingMixin
+
+
+def sleep_noop(duration, input_data, output_data_size):
+    """Sleep for the given duration and optionally return an output buffer."""
+    time.sleep(duration)
+    if output_data_size:
+        return np.ones(output_data_size, dtype=np.byte)
+
+
+def bench_short_tasks(
+    task_times,
+    n_jobs=2,
+    batch_size="auto",
+    pre_dispatch="2*n_jobs",
+    verbose=True,
+    input_data_size=0,
+    output_data_size=0,
+    backend=None,
+    memmap_input=False,
+):
+
+    with tempfile.NamedTemporaryFile() as temp_file:
+        if input_data_size:
+            if memmap_input:
+                temp_file.close()
+                input_data = np.memmap(temp_file.name, shape=input_data_size, dtype=np.byte, mode="w+")
+                input_data[:] = 1
+            else:
+                input_data = np.ones(input_data_size, dtype=np.byte)
+        else:
+            input_data = None
+
+        t0 = time.monotonic()
+        p = Parallel(
+            n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch, batch_size=batch_size, backend=backend
+        )
+        p(delayed(sleep_noop)(max(t, 0), input_data, output_data_size) for t in task_times)
+        duration = time.monotonic() - t0
+        effective_batch_size = getattr(p._backend, "_effective_batch_size", p.batch_size)
+        print(
+            "Completed {} tasks in {:3f}s, final batch_size={}\n".format(
+                len(task_times), duration, effective_batch_size
+            )
+        )
+        return duration, effective_batch_size
+
+
+if __name__ == "__main__":
+    mp.set_start_method("dragon")
+
+    bench_parameters = dict(
+        backend="multiprocessing", input_data_size=int(2e7), output_data_size=int(1e5), n_jobs=2, verbose=10,
+    )
+    print("Common benchmark parameters:")
+    pprint(bench_parameters)
+
+    AutoBatchingMixin.MIN_IDEAL_BATCH_DURATION = 0.2
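+    # Cap how large the auto-tuned batches may grow (counterpart to the MIN bound
+    # above; semantics assumed from joblib's constant names).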
+    AutoBatchingMixin.MAX_IDEAL_BATCH_DURATION = 2
+
+    print("# high variance, no trend")
+    high_variance = np.random.normal(loc=0.000001, scale=0.001, size=5000)
+    high_variance[high_variance < 0] = 0
+
+    bench_short_tasks(high_variance, **bench_parameters)
+    print("# low variance, no trend")
+    low_variance = np.empty_like(high_variance)
+    low_variance[:] = np.mean(high_variance)
+    bench_short_tasks(low_variance, **bench_parameters)
+
+    print("# cyclic trend")
+    slow_time = 0.1
+    positive_wave = np.cos(np.linspace(1, 4 * np.pi, 300)) ** 8
+    cyclic = positive_wave * slow_time
+    bench_short_tasks(cyclic, **bench_parameters)
+
+    print("# shuffle of the previous benchmark: same mean and variance")
+    np.random.shuffle(cyclic)
+    bench_short_tasks(cyclic, **bench_parameters)
diff --git a/examples/multiprocessing/joblib/compressor_comparison.py b/examples/multiprocessing/joblib/compressor_comparison.py
new file mode 100644
index 0000000..0a54fbd
--- /dev/null
+++ b/examples/multiprocessing/joblib/compressor_comparison.py
@@ -0,0 +1,90 @@
+"""
+===============================
+Improving I/O using compressors
+===============================
+
+This example compares the compressors available in Joblib. Only Zlib, LZMA,
+and LZ4 compression are used here, but Joblib also supports the BZ2 and GZip
+compression methods.
+For each compression method, this example dumps and reloads a dataset fetched
+from an online machine-learning database. This yields three pieces of
+information: the size on disk of the compressed data, the time spent dumping
+it, and the time spent reloading it from disk.
+"""
+
+import dragon
+import multiprocessing as mp
+import os
+import os.path
+import time
+
+import pandas as pd
+from joblib import dump, load
+
+if __name__ == "__main__":
+    mp.set_start_method("dragon")
+    url = "https://github.com/joblib/dataset/raw/main/kddcup.data.gz"
+    names = (
+        "duration, protocol_type, service, flag, src_bytes, "
+        "dst_bytes, land, wrong_fragment, urgent, hot, "
+        "num_failed_logins, logged_in, num_compromised, "
+        "root_shell, su_attempted, num_root, "
+        "num_file_creations"
+    ).split(", ")
+
+    data = pd.read_csv(url, names=names, nrows=int(1e6))
+
+    pickle_file = "./pickle_data.joblib"
+    start = time.monotonic()
+    with open(pickle_file, "wb") as f:
+        dump(data, f)
+    raw_dump_duration = time.monotonic() - start
+    print("Raw dump duration: %0.3fs" % raw_dump_duration)
+    raw_file_size = os.stat(pickle_file).st_size / 1e6
+    print("Raw dump file size: %0.3fMB" % raw_file_size)
+    start = time.monotonic()
+    with open(pickle_file, "rb") as f:
+        load(f)
+    raw_load_duration = time.monotonic() - start
+    print("Raw load duration: %0.3fs" % raw_load_duration)
+
+    start = time.monotonic()
+    with open(pickle_file, "wb") as f:
+        dump(data, f, compress="zlib")
+    zlib_dump_duration = time.monotonic() - start
+    print("Zlib dump duration: %0.3fs" % zlib_dump_duration)
+
+    zlib_file_size = os.stat(pickle_file).st_size / 1e6
+    print("Zlib file size: %0.3fMB" % zlib_file_size)
+
+    start = time.monotonic()
+    with open(pickle_file, "rb") as f:
+        load(f)
+    zlib_load_duration = time.monotonic() - start
+    print("Zlib load duration: %0.3fs" % zlib_load_duration)
+
+    start = time.monotonic()
+    with open(pickle_file, "wb") as f:
+        dump(data, f, compress=("lzma", 3))
+    lzma_dump_duration = time.monotonic() - start
+    print("LZMA dump duration: %0.3fs" % lzma_dump_duration)
+
+    lzma_file_size = os.stat(pickle_file).st_size / 1e6
+    print("LZMA file size: %0.3fMB" % lzma_file_size)
+
+    start = time.monotonic()
+    with open(pickle_file, "rb") as f:
+        load(f)
+    lzma_load_duration = time.monotonic() - start
+    print("LZMA load duration: %0.3fs" % lzma_load_duration)
+
+    # dump with LZ4 so that the size and load timings below measure LZ4-compressed data
+    start = time.monotonic()
+    with open(pickle_file, "wb") as f:
+        dump(data, f, compress="lz4")
+    lz4_dump_duration = time.monotonic() - start
+    print("LZ4 dump duration: %0.3fs" % lz4_dump_duration)
+
+    lz4_file_size = os.stat(pickle_file).st_size / 1e6
+    print("LZ4 file size: %0.3fMB" % lz4_file_size)
+
+    start = time.monotonic()
+    with open(pickle_file, "rb") as f:
+        load(f)
+    lz4_load_duration = time.monotonic() - start
+    print("LZ4 load duration: %0.3fs" % lz4_load_duration)
+
+    os.remove(pickle_file)
diff --git a/examples/multiprocessing/joblib/delayed_comparison.py b/examples/multiprocessing/joblib/delayed_comparison.py
new file mode 100644
index 0000000..7c6f12b
--- /dev/null
+++ b/examples/multiprocessing/joblib/delayed_comparison.py
@@ -0,0 +1,41 @@
+"""
+`delayed` works in conjunction with joblib's Parallel to enable optimization of
+Python code. It wraps a function call into a lazy, deferred object so that the
+computations can be parallelized across multiple CPU cores or machines.
+"""
+
+import dragon
+import multiprocessing as mp
+from joblib import Parallel, delayed
+import time
+
+if __name__ == "__main__":
+    mp.set_start_method("dragon")
+
+    def cube(x):
+        return x ** 3
+
+    def sleep_cube(x):
+        time.sleep(0.001)
+        return x ** 3
+
+    numbers = [*range(0, 10001, 1)]
+
+    start = time.monotonic()
+    results_no_delayed = [sleep_cube(number) for number in numbers]
+    end = time.monotonic()
+    time_no_delayed = end - start
+
+    delayed_calls = [delayed(cube)(number) for number in numbers]
+    start = time.monotonic()
+    results_delayed = Parallel(n_jobs=1, backend="multiprocessing")(delayed_calls)
+    end = time.monotonic()
+    time_delayed = end - start
+
+    print("Results without delayed:", results_no_delayed)
+    print("\n")
+    print("Results with delayed: ", results_delayed)
+    print("\n")
+    print("Time without delayed: ", time_no_delayed)
+    print("\n")
+    print("Time with delayed: ", time_delayed)
diff --git a/examples/multiprocessing/joblib/memory_basic_usage.py b/examples/multiprocessing/joblib/memory_basic_usage.py
new file mode 100644
index 0000000..26ef92d
--- /dev/null
+++ b/examples/multiprocessing/joblib/memory_basic_usage.py
@@ -0,0 +1,85 @@
+"""
+Adapted from the joblib memory basic usage example.
+This example illustrates the usage of :class:`joblib.Memory` with both functions and methods.
+Be sure to set the random seed to generate deterministic data; if the data is not deterministic, the :class:`joblib.Memory` instance will not be able to reuse the cache from one run to the next.
+On the first call, the computation time corresponds to the time to compute the results plus the time to dump the results to disk.
+At the second call, the computation time is largely reduced since the results are obtained by loading the data previously dumped to disk instead of recomputing them.
+""" + +import dragon +import multiprocessing as mp +import time +import numpy as np + +if __name__ == "__main__": + + mp.set_start_method("dragon") + + def costly_compute(data, column_index=0): + time.sleep(5) + return data[column_index] + + rng = np.random.RandomState(42) + data = rng.randn(int(1e5), 10) + start = time.monotonic() + data_trans = costly_compute(data) + end = time.monotonic() + + print("\nThe function took {:.2f} s to compute.".format(end - start)) + print("\nThe transformed data are:\n {}".format(data_trans)) + + from joblib import Memory + + location = "./cachedir" + memory = Memory(location, verbose=0) + + def costly_compute_cached(data, column_index=0): + """Simulate an expensive computation""" + time.sleep(5) + return data[column_index] + + costly_compute_cached = memory.cache(costly_compute_cached) + start = time.monotonic() + data_trans = costly_compute_cached(data) + end = time.monotonic() + + print("\nThe function took {:.2f} s to compute.".format(end - start)) + print("\nThe transformed data are:\n {}".format(data_trans)) + + start = time.monotonic() + data_trans = costly_compute_cached(data) + end = time.monotonic() + + print("\nThe function took {:.2f} s to compute.".format(end - start)) + print("\nThe transformed data are:\n {}".format(data_trans)) + + def _costly_compute_cached(data, column): + time.sleep(5) + return data[column] + + class Algorithm(object): + """A class which is using the previous function.""" + + def __init__(self, column=0): + self.column = column + + def transform(self, data): + costly_compute = memory.cache(_costly_compute_cached) + return costly_compute(data, self.column) + + transformer = Algorithm() + start = time.monotonic() + data_trans = transformer.transform(data) + end = time.monotonic() + + print("\nThe function took {:.2f} s to compute.".format(end - start)) + print("\nThe transformed data are:\n {}".format(data_trans)) + + start = time.monotonic() + data_trans = transformer.transform(data) + end = time.monotonic() + + print("\nThe function took {:.2f} s to compute.".format(end - start)) + print("\nThe transformed data are:\n {}".format(data_trans)) + + memory.clear(warn=False) diff --git a/examples/multiprocessing/joblib/nested_parallel_memory.py b/examples/multiprocessing/joblib/nested_parallel_memory.py new file mode 100644 index 0000000..6d6035c --- /dev/null +++ b/examples/multiprocessing/joblib/nested_parallel_memory.py @@ -0,0 +1,133 @@ +""" +This example from the joblib package illustrates how to cache intermediate computing results using +:class:`joblib.Memory` within :class:`joblib.Parallel`. Processing is executed in parallel with caching for the deterministic data. 
+ +""" + +import dragon +import multiprocessing as mp +import time +from joblib import Memory, Parallel, delayed +import numpy as np +import time + +if __name__ == "__main__": + mp.set_start_method("dragon") + + def costly_compute(data, column): + time.sleep(2) + return data[column] + + def data_processing_mean(data, column): + return costly_compute(data, column).mean() + + rng = np.random.RandomState(42) + data = rng.randn(int(1e4), 4) + + start = time.monotonic() + results = [data_processing_mean(data, col) for col in range(data.shape[1])] + stop = time.monotonic() + + print("\nSequential processing") + print("Elapsed time for the entire processing: {:.2f} s".format(stop - start)) + + location = "./cachedir" + memory = Memory(location, verbose=0) + costly_compute_cached = memory.cache(costly_compute) + + def data_processing_mean_using_cache(data, column): + """Compute the mean of a column.""" + return costly_compute_cached(data, column).mean() + + start = time.monotonic() + results = Parallel(n_jobs=2, backend="multiprocessing")( + delayed(data_processing_mean_using_cache)(data, col) for col in range(data.shape[1]) + ) + stop = time.monotonic() + + print("\nFirst round - caching the data") + print("Elapsed time for the entire processing: {:.2f} s".format(stop - start)) + + start = time.monotonic() + results = Parallel(n_jobs=2, backend="multiprocessing")( + delayed(data_processing_mean_using_cache)(data, col) for col in range(data.shape[1]) + ) + stop = time.monotonic() + + print("\nSecond round - reloading from the cache") + print("Elapsed time for the entire processing: {:.2f} s".format(stop - start)) + + def data_processing_max_using_cache(data, column): + """Compute the max of a column.""" + return costly_compute_cached(data, column).max() + + start = time.monotonic() + results = Parallel(n_jobs=2, backend="multiprocessing")( + delayed(data_processing_max_using_cache)(data, col) for col in range(data.shape[1]) + ) + stop = time.monotonic() + + print("\nReusing intermediate checkpoints") + print("Elapsed time for the entire processing: {:.2f} s".format(stop - start)) + + memory.clear(warn=False) + + def costly_compute(data, column): + """Emulate a costly function by sleeping and returning a column.""" + time.sleep(2) + return data[column] + + def data_processing_mean(data, column): + """Compute the mean of a column.""" + return costly_compute(data, column).mean() + + rng = np.random.RandomState(42) + data = rng.randn(int(1e4), 4) + + start = time.monotonic() + results = [data_processing_mean(data, col) for col in range(data.shape[1])] + stop = time.monotonic() + + print("\nSequential processing") + print("Elapsed time for the entire processing: {:.2f} s".format(stop - start)) + + location = "./cachedir" + memory = Memory(location, verbose=0) + costly_compute_cached = memory.cache(costly_compute) + + def data_processing_mean_using_cache(data, column): + """Compute the mean of a column.""" + return costly_compute_cached(data, column).mean() + + start = time.monotonic() + results = Parallel(n_jobs=2, backend="multiprocessing")( + delayed(data_processing_mean_using_cache)(data, col) for col in range(data.shape[1]) + ) + stop = time.monotonic() + + print("\nFirst round - caching the data") + print("Elapsed time for the entire processing: {:.2f} s".format(stop - start)) + + start = time.monotonic() + results = Parallel(n_jobs=2, backend="multiprocessing")( + delayed(data_processing_mean_using_cache)(data, col) for col in range(data.shape[1]) + ) + stop = time.monotonic() + + print("\nSecond 
round - reloading from the cache")
+    print("Elapsed time for the entire processing: {:.2f} s".format(stop - start))
+
+    def data_processing_max_using_cache(data, column):
+        """Compute the max of a column."""
+        return costly_compute_cached(data, column).max()
+
+    start = time.monotonic()
+    results = Parallel(n_jobs=2, backend="multiprocessing")(
+        delayed(data_processing_max_using_cache)(data, col) for col in range(data.shape[1])
+    )
+    stop = time.monotonic()
+
+    print("\nReusing intermediate checkpoints")
+    print("Elapsed time for the entire processing: {:.2f} s".format(stop - start))
+
+    memory.clear(warn=False)
diff --git a/examples/multiprocessing/joblib/parallel_memmap.py b/examples/multiprocessing/joblib/parallel_memmap.py
new file mode 100644
index 0000000..d31eccb
--- /dev/null
+++ b/examples/multiprocessing/joblib/parallel_memmap.py
@@ -0,0 +1,76 @@
+"""
+This example from joblib illustrates some features enabled by using a memory map
+(:class:`numpy.memmap`) within :class:`joblib.Parallel`. First, we show that
+dumping a huge data array ahead of passing it to :class:`joblib.Parallel`
+speeds up computation. Then, we show the possibility to provide write access to
+the original data.
+"""
+import dragon
+import multiprocessing as mp
+
+import numpy as np
+import shutil
+import time
+import os
+from joblib import Parallel, delayed, dump, load
+
+if __name__ == "__main__":
+    mp.set_start_method("dragon")
+
+    data = np.random.random((int(1e7),))
+    window_size = int(5e5)
+    slices = [slice(start, start + window_size) for start in range(0, data.size - window_size, int(1e5))]
+
+    def slow_mean(data, sl):
+        time.sleep(0.01)
+        return data[sl].mean()
+
+    start = time.monotonic()
+    results = [slow_mean(data, sl) for sl in slices]
+    stop = time.monotonic()
+    print("\nElapsed time computing the average of a couple of slices {:.2f} s".format(stop - start))
+
+    tic = time.monotonic()
+    results = Parallel(n_jobs=2, backend="multiprocessing")(delayed(slow_mean)(data, sl) for sl in slices)
+    toc = time.monotonic()
+    print("\nElapsed time computing the average of a couple of slices {:.2f} s".format(toc - tic))
+
+    folder = "./joblib_memmap"
+    try:
+        os.mkdir(folder)
+    except FileExistsError:
+        pass
+
+    data_filename_memmap = os.path.join(folder, "data_memmap")
+    dump(data, data_filename_memmap)
+    data = load(data_filename_memmap, mmap_mode="r")
+
+    tic = time.monotonic()
+    results = Parallel(n_jobs=2, backend="multiprocessing")(delayed(slow_mean)(data, sl) for sl in slices)
+    toc = time.monotonic()
+    print("\nElapsed time computing the average of a couple of slices {:.2f} s\n".format(toc - tic))
+
+    def slow_mean_write_output(data, sl, output, idx):
+        time.sleep(0.005)
+        res_ = data[sl].mean()
+        print("[Worker %d] Mean for slice %d is %f" % (os.getpid(), idx, res_))
+        output[idx] = res_
+
+    output_filename_memmap = os.path.join(folder, "output_memmap")
+
+    output = np.memmap(output_filename_memmap, dtype=data.dtype, shape=len(slices), mode="w+")
+
+    data = load(data_filename_memmap, mmap_mode="r")
+
+    Parallel(n_jobs=2, backend="multiprocessing")(
+        delayed(slow_mean_write_output)(data, sl, output, idx) for idx, sl in enumerate(slices)
+    )
+
+    print("\nExpected means computed in the parent process:\n {}".format(np.array(results)))
+    print("\nActual means computed by the worker processes:\n {}".format(output))
+
+    try:
+        shutil.rmtree(folder)
+    except Exception:
+        print("Could not clean up automatically.")
diff --git a/examples/multiprocessing/joblib/parallel_random_state.py b/examples/multiprocessing/joblib/parallel_random_state.py
new file mode 100644
index 0000000..6508c3c
--- /dev/null
+++ b/examples/multiprocessing/joblib/parallel_random_state.py
@@ -0,0 +1,66 @@
+"""
+Randomness is affected differently by parallel execution under the different
+backends.
+
+In particular, when using multiple processes, the random sequence can be
+the same in all processes. This example from joblib illustrates the problem and shows
+how to work around it.
+"""
+
+import dragon
+import multiprocessing as mp
+
+import time
+import numpy as np
+from joblib import Parallel, delayed
+
+if __name__ == "__main__":
+
+    mp.set_start_method("dragon")
+
+    def print_vector(vector, backend):
+        """Helper function to print the generated vector with a given backend."""
+        print(
+            "\nThe different generated vectors using the {} backend are:\n {}".format(
+                backend, np.array(vector)
+            )
+        )
+
+    def stochastic_function(max_value):
+        """Randomly generate an integer vector up to a maximum value."""
+        return np.random.randint(max_value, size=5)
+
+    n_vectors = 5
+    random_vector = [stochastic_function(10) for _ in range(n_vectors)]
+    print(
+        "\nThe different generated vectors in a sequential manner are:\n {}".format(np.array(random_vector))
+    )
+
+    start = time.monotonic()
+    random_vector = Parallel(n_jobs=2, backend="multiprocessing")(
+        delayed(stochastic_function)(10) for _ in range(n_vectors)
+    )
+    stop = time.monotonic()
+    print(stop - start)
+    print_vector(random_vector, "multiprocessing")
+
+    def stochastic_function_seeded(max_value, random_state):
+        rng = np.random.RandomState(random_state)
+        return rng.randint(max_value, size=5)
+
+    start = time.monotonic()
+    random_vector = Parallel(n_jobs=2, backend="multiprocessing")(
+        delayed(stochastic_function_seeded)(10, None) for _ in range(n_vectors)
+    )
+    stop = time.monotonic()
+    print(stop - start)
+    print_vector(random_vector, "multiprocessing")
+
+    random_state = np.random.randint(np.iinfo(np.int32).max, size=n_vectors)
+    start = time.monotonic()
+    random_vector = Parallel(n_jobs=2, backend="multiprocessing")(
+        delayed(stochastic_function_seeded)(10, rng) for rng in random_state
+    )
+    stop = time.monotonic()
+    print(stop - start)
+    print_vector(random_vector, "multiprocessing")
diff --git a/examples/multiprocessing/joblib/serialization_and_wrappers.py b/examples/multiprocessing/joblib/serialization_and_wrappers.py
new file mode 100644
index 0000000..16b2a4e
--- /dev/null
+++ b/examples/multiprocessing/joblib/serialization_and_wrappers.py
@@ -0,0 +1,44 @@
+"""
+This example by Thomas Moreau highlights the options for tampering with the joblib
+serialization process.
+ +""" + +import dragon +import multiprocessing as mp + +import sys +import time +import traceback +from joblib.externals.loky import set_loky_pickler +from joblib import parallel_config +from joblib import Parallel, delayed +from joblib import wrap_non_picklable_objects + +if __name__ == "__main__": + mp.set_start_method("dragon") + + def func_async(i, *args): + return 2 * i + + print(Parallel(n_jobs=2, backend="multiprocessing")(delayed(func_async)(21) for _ in range(1))[0]) + + large_list = list(range(1000000)) + + t_start = time.monotonic() + Parallel(n_jobs=2, backend="multiprocessing")(delayed(func_async)(21, large_list) for _ in range(1)) + print("With loky backend and cloudpickle serialization: {:.3f}s".format(time.monotonic() - t_start)) + + with parallel_config("multiprocessing"): + t_start = time.monotonic() + Parallel(n_jobs=2, backend="multiprocessing")(delayed(func_async)(21, large_list) for _ in range(1)) + print( + "With multiprocessing backend and pickle serialization: {:.3f}s".format( + time.monotonic() - t_start + ) + ) + + set_loky_pickler("pickle") + t_start = time.monotonic() + Parallel(n_jobs=2, backend="multiprocessing")(delayed(id)(large_list) for _ in range(1)) + print("With pickle serialization: {:.3f}s".format(time.monotonic() - t_start)) diff --git a/examples/multiprocessing/p2p_lat.py b/examples/multiprocessing/p2p_lat.py index e45a387..7adfa78 100644 --- a/examples/multiprocessing/p2p_lat.py +++ b/examples/multiprocessing/p2p_lat.py @@ -5,7 +5,7 @@ import time import argparse -BURN_ITERS = 2 +BURN_ITERS = 1 def worker_conn(id, send_link, recv_link, result_link, msg_size, total_iterations, use_bytes): @@ -78,7 +78,7 @@ def run_p2p_lat(iterations=100, max_msg_sz=1024, use_bytes=False, with_queues=Fa right_left_link = mp.Pipe(duplex=False) msg_sz = 2 - print(f"Msglen [B] Lat [usec]") + print(f"Msglen [B] Lat [usec]", flush=True) while msg_sz <= max_msg_sz: if with_queues: proc0 = mp.Process(target=worker_queue, args=(0, q1, q0, result_links[0][1], msg_sz, iterations)) @@ -120,7 +120,7 @@ def run_p2p_lat(iterations=100, max_msg_sz=1024, use_bytes=False, with_queues=Fa proc0.join() proc1.join() - print(f"{msg_sz} {time_avg}") + print(f"{msg_sz} {time_avg}", flush=True) msg_sz *= 2 @@ -146,11 +146,11 @@ def run_p2p_lat(iterations=100, max_msg_sz=1024, use_bytes=False, with_queues=Fa my_args = parser.parse_args() if my_args.dragon: - print("using Dragon") + print("using Dragon", flush=True) mp.set_start_method("dragon") else: - print("using multiprocessing") + print("using multiprocessing", flush=True) mp.set_start_method("spawn") run_p2p_lat( diff --git a/examples/multiprocessing/unittests/common.py b/examples/multiprocessing/unittests/common.py index 0ea92a6..47f5ead 100644 --- a/examples/multiprocessing/unittests/common.py +++ b/examples/multiprocessing/unittests/common.py @@ -9,7 +9,14 @@ import gc import test.support -from test import support +try: + from test.support.import_helper import import_module + from test.support.threading_helper import join_thread +except ImportError: + #location prior to Python 3.10 + from test.support import import_module + from test.support import join_thread + import threading @@ -22,10 +29,10 @@ from multiprocessing.connection import wait # Skip tests if _multiprocessing wasn't built. -_multiprocessing = test.support.import_module("_multiprocessing") +_multiprocessing = import_module("_multiprocessing") # Skip tests if sem_open implementation is broken. 
-support.skip_if_broken_multiprocessing_synchronize() +test.support.skip_if_broken_multiprocessing_synchronize() def latin(s): @@ -42,7 +49,7 @@ def join_process(process): """Since multiprocessing.Process has the same API than threading.Thread (join() and is_alive(), the support function can be reused """ - support.join_thread(process) + join_thread(process) # @@ -186,13 +193,13 @@ def tearDownClass(cls): processes = set(multiprocessing.process._dangling) - set(cls.dangling[0]) if processes: test.support.environment_altered = True - support.print_warning(f"Dangling processes: {processes}") + test.support.print_warning(f"Dangling processes: {processes}") processes = None threads = set(threading._dangling) - set(cls.dangling[1]) if threads: test.support.environment_altered = True - support.print_warning(f"Dangling threads: {threads}") + test.support.print_warning(f"Dangling threads: {threads}") threads = None @@ -262,7 +269,7 @@ def tearDownClass(cls): dt = time.monotonic() - start_time if dt >= 5.0: test.support.environment_altered = True - support.print_warning( + test.support.print_warning( f"multiprocessing.Manager still has " f"{multiprocessing.active_children()} " f"active children after {dt} seconds" @@ -275,8 +282,8 @@ def tearDownClass(cls): # ensure that all processes which hold a reference to a # managed object have been joined. test.support.environment_altered = True - support.print_warning("Shared objects which still exist " "at manager shutdown:") - support.print_warning(cls.manager._debug_info()) + test.support.print_warning("Shared objects which still exist " "at manager shutdown:") + test.support.print_warning(cls.manager._debug_info()) cls.manager.shutdown() cls.manager.join() cls.manager = None @@ -354,14 +361,14 @@ def tearDownModule(): if processes: need_sleep = True test.support.environment_altered = True - support.print_warning(f"Dangling processes: {processes}") + test.support.print_warning(f"Dangling processes: {processes}") processes = None threads = set(threading._dangling) - set(dangling[1]) if threads: need_sleep = True test.support.environment_altered = True - support.print_warning(f"Dangling threads: {threads}") + test.support.print_warning(f"Dangling threads: {threads}") threads = None # Sleep 500 ms to give time to child processes to complete. diff --git a/examples/multiprocessing/unittests/test_condition.py b/examples/multiprocessing/unittests/test_condition.py index 745a6f1..9cb85a6 100644 --- a/examples/multiprocessing/unittests/test_condition.py +++ b/examples/multiprocessing/unittests/test_condition.py @@ -6,7 +6,7 @@ import os import signal -from test import support +import test.support import threading @@ -290,7 +290,7 @@ def test_waitfor_timeout(self): p = self.Process(target=self._test_waitfor_timeout_f, args=(cond, state, success, sem)) p.daemon = True p.start() - self.assertTrue(sem.acquire(timeout=support.LONG_TIMEOUT)) + self.assertTrue(sem.acquire(timeout=test.support.LONG_TIMEOUT)) # Only increment 3 times, so state == 4 is never reached. for i in range(3): @@ -584,7 +584,7 @@ def test_waitfor_timeout(self): p = self.Process(target=self._test_waitfor_timeout_f, args=(cond, state, success, sem)) p.daemon = True p.start() - self.assertTrue(sem.acquire(timeout=support.LONG_TIMEOUT)) + self.assertTrue(sem.acquire(timeout=test.support.LONG_TIMEOUT)) # Only increment 3 times, so state == 4 is never reached. 
for i in range(3): @@ -878,7 +878,7 @@ def test_waitfor_timeout(self): p = self.Process(target=self._test_waitfor_timeout_f, args=(cond, state, success, sem)) p.daemon = True p.start() - self.assertTrue(sem.acquire(timeout=support.LONG_TIMEOUT)) + self.assertTrue(sem.acquire(timeout=test.support.LONG_TIMEOUT)) # Only increment 3 times, so state == 4 is never reached. for i in range(3): diff --git a/examples/multiprocessing/unittests/test_connection.py b/examples/multiprocessing/unittests/test_connection.py index 4cb6109..495beef 100644 --- a/examples/multiprocessing/unittests/test_connection.py +++ b/examples/multiprocessing/unittests/test_connection.py @@ -14,7 +14,6 @@ msvcrt = None import test.support -from test import support from test.support import hashlib_helper from test.support import socket_helper @@ -327,7 +326,7 @@ class WithProcessesTestPicklingConnections(BaseTestCase, ProcessesMixin, unittes def tearDownClass(cls): from multiprocessing import resource_sharer - resource_sharer.stop(timeout=support.LONG_TIMEOUT) + resource_sharer.stop(timeout=test.support.LONG_TIMEOUT) @classmethod def _listener(cls, conn, families): diff --git a/examples/multiprocessing/unittests/test_listener.py b/examples/multiprocessing/unittests/test_listener.py index 01491a9..35fcc12 100644 --- a/examples/multiprocessing/unittests/test_listener.py +++ b/examples/multiprocessing/unittests/test_listener.py @@ -4,7 +4,6 @@ import time import socket -from test import support from test.support import hashlib_helper from test.support import socket_helper diff --git a/examples/multiprocessing/unittests/test_others.py b/examples/multiprocessing/unittests/test_others.py index 8394346..71341dc 100644 --- a/examples/multiprocessing/unittests/test_others.py +++ b/examples/multiprocessing/unittests/test_others.py @@ -20,8 +20,6 @@ import unittest import unittest.mock import test.support -import test.support.script_helper -from test import support from test.support import hashlib_helper from test.support import socket_helper @@ -617,7 +615,7 @@ def test_closefd(self): class TestIgnoreEINTR(unittest.TestCase): # Sending CONN_MAX_SIZE bytes into a multiprocessing pipe must block - CONN_MAX_SIZE = max(support.PIPE_MAX_SIZE, support.SOCK_MAX_SIZE) + CONN_MAX_SIZE = max(test.support.PIPE_MAX_SIZE, test.support.SOCK_MAX_SIZE) @classmethod def _test_ignore(cls, conn): @@ -817,7 +815,7 @@ def create_and_register_resource(rtype): p.terminate() p.wait() - deadline = time.monotonic() + support.LONG_TIMEOUT + deadline = time.monotonic() + test.support.LONG_TIMEOUT while time.monotonic() < deadline: time.sleep(0.5) try: @@ -845,7 +843,7 @@ def check_resource_tracker_death(self, signum, should_die): pid = _resource_tracker._pid if pid is not None: os.kill(pid, signal.SIGKILL) - support.wait_process(pid, exitcode=-signal.SIGKILL) + test.support.wait_process(pid, exitcode=-signal.SIGKILL) with warnings.catch_warnings(): warnings.simplefilter("ignore") _resource_tracker.ensure_running() @@ -1047,7 +1045,7 @@ def tearDown(self): @classmethod def setUpClass(cls): - support.reap_children() + test.support.reap_children() tearDownClass = setUpClass @@ -1064,7 +1062,7 @@ def wait_proc_exit(self): dt = time.monotonic() - start_time if dt >= 5.0: test.support.environment_altered = True - support.print_warning( + test.support.print_warning( f"multiprocessing.Manager still has " f"{multiprocessing.active_children()} " f"active children after {dt} seconds" @@ -1247,9 +1245,15 @@ def test_namespace(self): class MiscTestCase(unittest.TestCase): 
def test__all__(self): # Just make sure names in blacklist are excluded - support.check__all__( - self, multiprocessing, extra=multiprocessing.__all__, blacklist=["SUBDEBUG", "SUBWARNING"] - ) + try: + test.support.check__all__( + self, multiprocessing, extra=multiprocessing.__all__, not_exported=["SUBDEBUG", "SUBWARNING"] + ) + except TypeError: + #kwargs prior to Python 3.10 + test.support.check__all__( + self, multiprocessing, extra=multiprocessing.__all__, blacklist=["SUBDEBUG", "SUBWARNING"] + ) # diff --git a/examples/multiprocessing/unittests/test_pool.py b/examples/multiprocessing/unittests/test_pool.py index 66239dd..392df9f 100644 --- a/examples/multiprocessing/unittests/test_pool.py +++ b/examples/multiprocessing/unittests/test_pool.py @@ -10,7 +10,6 @@ import test.support import test.support.script_helper from test.support import hashlib_helper -from test import support import dragon # DRAGON import before multiprocessing @@ -261,7 +260,7 @@ def test_imap_unordered_handle_iterable_exception(self): self.assertIn(value, expected_values) expected_values.remove(value) - + def test_make_pool(self): expected_error = RemoteError if self.TYPE == "manager" else ValueError @@ -424,9 +423,9 @@ def test_resource_warning(self): # force state to RUN to emit ResourceWarning in __del__() pool._state = multiprocessing.pool.RUN - with support.check_warnings(("unclosed running multiprocessing pool", ResourceWarning)): + with test.support.check_warnings(("unclosed running multiprocessing pool", ResourceWarning)): pool = None - support.gc_collect() + test.support.gc_collect() def raising(): @@ -487,7 +486,6 @@ class WithProcessesTestPoolWorkerLifetime(BaseTestCase, ProcessesMixin, unittest # DRAGON ALLOWED_TYPES = ('processes', ) - @unittest.skip(f"bug filed CIRRUS-1473") def test_pool_worker_lifetime(self): p = multiprocessing.Pool(3, maxtasksperchild=10) self.assertEqual(3, len(p._pool)) @@ -895,9 +893,9 @@ def test_resource_warning(self): # force state to RUN to emit ResourceWarning in __del__() pool._state = multiprocessing.pool.RUN - with support.check_warnings(("unclosed running multiprocessing pool", ResourceWarning)): + with test.support.check_warnings(("unclosed running multiprocessing pool", ResourceWarning)): pool = None - support.gc_collect() + test.support.gc_collect() @unittest.skip("DRAGON: Threads not implemented") @@ -1245,9 +1243,9 @@ def test_resource_warning(self): # force state to RUN to emit ResourceWarning in __del__() pool._state = multiprocessing.pool.RUN - with support.check_warnings(("unclosed running multiprocessing pool", ResourceWarning)): + with test.support.check_warnings(("unclosed running multiprocessing pool", ResourceWarning)): pool = None - support.gc_collect() + test.support.gc_collect() # DRAGON if __name__ == "__main__": diff --git a/examples/multiprocessing/unittests/test_process.py b/examples/multiprocessing/unittests/test_process.py index d6ac5b0..3681eb1 100644 --- a/examples/multiprocessing/unittests/test_process.py +++ b/examples/multiprocessing/unittests/test_process.py @@ -10,7 +10,17 @@ import unittest import test.support -from test import support + +try: + from test.support.os_helper import fd_count + from test.support.os_helper import TESTFN + from test.support.os_helper import unlink +except ImportError: + #location prior to Python 3.10 + from test.support import fd_count + from test.support import TESTFN + from test.support import unlink + import threading @@ -117,7 +127,7 @@ def test_parent_process(self): p = 
self.Process(target=self._test_create_grandchild_process, args=(wconn,)) p.start() - if not rconn.poll(timeout=support.LONG_TIMEOUT): + if not rconn.poll(timeout=test.support.LONG_TIMEOUT): raise AssertionError("Could not communicate with child process") parent_process_status = rconn.recv() self.assertEqual(parent_process_status, "alive") @@ -125,7 +135,7 @@ def test_parent_process(self): p.terminate() p.join() - if not rconn.poll(timeout=support.LONG_TIMEOUT): + if not rconn.poll(timeout=test.support.LONG_TIMEOUT): raise AssertionError("Could not communicate with child process") parent_process_status = rconn.recv() self.assertEqual(parent_process_status, "not alive") @@ -141,7 +151,7 @@ def _test_report_parent_status(cls, wconn): from multiprocessing.process import parent_process wconn.send("alive" if parent_process().is_alive() else "not alive") - parent_process().join(timeout=support.SHORT_TIMEOUT) + parent_process().join(timeout=test.support.SHORT_TIMEOUT) wconn.send("alive" if parent_process().is_alive() else "not alive") def test_process(self): @@ -423,7 +433,7 @@ def test_lose_target_ref(self): @classmethod def _test_child_fd_inflation(self, evt, q): - q.put(test.support.fd_count()) + q.put(fd_count()) evt.wait() def test_child_fd_inflation(self): @@ -625,8 +635,8 @@ def test_stderr_flush(self): if self.TYPE == "threads": self.skipTest("test not appropriate for {}".format(self.TYPE)) - testfn = test.support.TESTFN - self.addCleanup(test.support.unlink, testfn) + testfn = TESTFN + self.addCleanup(unlink, testfn) proc = self.Process(target=self._test_stderr_flush, args=(testfn,)) proc.start() proc.join() @@ -655,8 +665,8 @@ def test_sys_exit(self): if self.TYPE == "threads": self.skipTest("test not appropriate for {}".format(self.TYPE)) - testfn = test.support.TESTFN - self.addCleanup(test.support.unlink, testfn) + testfn = TESTFN + self.addCleanup(unlink, testfn) for reason in ( [1, 2, 3], @@ -764,7 +774,7 @@ def test_parent_process(self): p = self.Process(target=self._test_create_grandchild_process, args=(wconn,)) p.start() - if not rconn.poll(timeout=support.LONG_TIMEOUT): + if not rconn.poll(timeout=test.support.LONG_TIMEOUT): raise AssertionError("Could not communicate with child process") parent_process_status = rconn.recv() self.assertEqual(parent_process_status, "alive") @@ -772,7 +782,7 @@ def test_parent_process(self): p.terminate() p.join() - if not rconn.poll(timeout=support.LONG_TIMEOUT): + if not rconn.poll(timeout=test.support.LONG_TIMEOUT): raise AssertionError("Could not communicate with child process") parent_process_status = rconn.recv() self.assertEqual(parent_process_status, "not alive") @@ -788,7 +798,7 @@ def _test_report_parent_status(cls, wconn): from multiprocessing.process import parent_process wconn.send("alive" if parent_process().is_alive() else "not alive") - parent_process().join(timeout=support.SHORT_TIMEOUT) + parent_process().join(timeout=test.support.SHORT_TIMEOUT) wconn.send("alive" if parent_process().is_alive() else "not alive") @unittest.skip("bug filed PE-40908") @@ -1075,7 +1085,7 @@ def test_lose_target_ref(self): @classmethod def _test_child_fd_inflation(self, evt, q): - q.put(test.support.fd_count()) + q.put(fd_count()) evt.wait() @unittest.skip("DRAGON: Semlock not implemented") diff --git a/examples/multiprocessing/unittests/test_queue.py b/examples/multiprocessing/unittests/test_queue.py index a4d6f16..4199896 100644 --- a/examples/multiprocessing/unittests/test_queue.py +++ b/examples/multiprocessing/unittests/test_queue.py @@ -6,7 
+6,14 @@ import test.support from test.support import hashlib_helper -from test import support +try: + from test.support.os_helper import temp_cwd + from test.support.import_helper import DirsOnSysPath +except ImportError: + #location prior to Python 3.10 + from test.support import temp_cwd + from test.support import DirsOnSysPath + import dragon # DRAGON import before multiprocessing @@ -250,7 +257,7 @@ def test_task_done(self): close_queue(queue) def test_no_import_lock_contention(self): - with test.support.temp_cwd(): + with temp_cwd(): module_name = "imported_by_an_imported_module" with open(module_name + ".py", "w") as f: f.write( @@ -265,7 +272,7 @@ def test_no_import_lock_contention(self): """ ) - with test.support.DirsOnSysPath(os.getcwd()): + with DirsOnSysPath(os.getcwd()): try: __import__(module_name) except pyqueue.Empty: @@ -295,7 +302,7 @@ def __reduce__(self): q = self.Queue() q.put(NotSerializable()) q.put(True) - self.assertTrue(q.get(timeout=support.SHORT_TIMEOUT)) + self.assertTrue(q.get(timeout=test.support.SHORT_TIMEOUT)) close_queue(q) with test.support.captured_stderr(): @@ -310,7 +317,7 @@ def __reduce__(self): # qsize is not available on all platform as it # relies on sem_getvalue pass - self.assertTrue(q.get(timeout=support.SHORT_TIMEOUT)) + self.assertTrue(q.get(timeout=test.support.SHORT_TIMEOUT)) # Check that the size of the queue is correct self.assertTrue(q.empty()) @@ -349,7 +356,7 @@ def _on_queue_feeder_error(e, obj): # Verify that q is still functioning correctly q.put(True) - self.assertTrue(q.get(timeout=support.SHORT_TIMEOUT)) + self.assertTrue(q.get(timeout=test.support.SHORT_TIMEOUT)) # Assert that the serialization and the hook have been called correctly self.assertTrue(not_serializable_obj.reduce_was_called) @@ -570,7 +577,7 @@ def test_task_done(self): close_queue(queue) def test_no_import_lock_contention(self): - with test.support.temp_cwd(): + with temp_cwd(): module_name = "imported_by_an_imported_module" with open(module_name + ".py", "w") as f: f.write( @@ -585,7 +592,7 @@ def test_no_import_lock_contention(self): """ ) - with test.support.DirsOnSysPath(os.getcwd()): + with DirsOnSysPath(os.getcwd()): try: __import__(module_name) except pyqueue.Empty: @@ -615,7 +622,7 @@ def __reduce__(self): q = self.Queue() q.put(NotSerializable()) q.put(True) - self.assertTrue(q.get(timeout=support.SHORT_TIMEOUT)) + self.assertTrue(q.get(timeout=test.support.SHORT_TIMEOUT)) close_queue(q) with test.support.captured_stderr(): @@ -630,7 +637,7 @@ def __reduce__(self): # qsize is not available on all platform as it # relies on sem_getvalue pass - self.assertTrue(q.get(timeout=support.SHORT_TIMEOUT)) + self.assertTrue(q.get(timeout=test.support.SHORT_TIMEOUT)) # Check that the size of the queue is correct self.assertTrue(q.empty()) close_queue(q) @@ -668,7 +675,7 @@ def _on_queue_feeder_error(e, obj): # Verify that q is still functioning correctly q.put(True) - self.assertTrue(q.get(timeout=support.SHORT_TIMEOUT)) + self.assertTrue(q.get(timeout=test.support.SHORT_TIMEOUT)) # Assert that the serialization and the hook have been called correctly self.assertTrue(not_serializable_obj.reduce_was_called) @@ -888,7 +895,7 @@ def test_task_done(self): close_queue(queue) def test_no_import_lock_contention(self): - with test.support.temp_cwd(): + with temp_cwd(): module_name = "imported_by_an_imported_module" with open(module_name + ".py", "w") as f: f.write( @@ -903,7 +910,7 @@ def test_no_import_lock_contention(self): """ ) - with 
test.support.DirsOnSysPath(os.getcwd()): + with DirsOnSysPath(os.getcwd()): try: __import__(module_name) except pyqueue.Empty: @@ -933,7 +940,7 @@ def __reduce__(self): q = self.Queue() q.put(NotSerializable()) q.put(True) - self.assertTrue(q.get(timeout=support.SHORT_TIMEOUT)) + self.assertTrue(q.get(timeout=test.support.SHORT_TIMEOUT)) close_queue(q) with test.support.captured_stderr(): @@ -948,7 +955,7 @@ def __reduce__(self): # qsize is not available on all platform as it # relies on sem_getvalue pass - self.assertTrue(q.get(timeout=support.SHORT_TIMEOUT)) + self.assertTrue(q.get(timeout=test.support.SHORT_TIMEOUT)) # Check that the size of the queue is correct self.assertTrue(q.empty()) close_queue(q) @@ -986,7 +993,7 @@ def _on_queue_feeder_error(e, obj): # Verify that q is still functioning correctly q.put(True) - self.assertTrue(q.get(timeout=support.SHORT_TIMEOUT)) + self.assertTrue(q.get(timeout=test.support.SHORT_TIMEOUT)) # Assert that the serialization and the hook have been called correctly self.assertTrue(not_serializable_obj.reduce_was_called) diff --git a/examples/multiprocessing/unittests/test_shared_memory.py b/examples/multiprocessing/unittests/test_shared_memory.py index 721fbf5..258b7d3 100644 --- a/examples/multiprocessing/unittests/test_shared_memory.py +++ b/examples/multiprocessing/unittests/test_shared_memory.py @@ -13,8 +13,6 @@ import gc import test.support -from test import support -import test.support.script_helper from test.support import hashlib_helper import dragon # DRAGON import before multiprocessing diff --git a/examples/workflows/ai-in-the-loop/README.md b/examples/workflows/ai-in-the-loop/README.md index 38f6bd3..caf64c3 100755 --- a/examples/workflows/ai-in-the-loop/README.md +++ b/examples/workflows/ai-in-the-loop/README.md @@ -1,4 +1,4 @@ -# AI-in-the-loop workflow with Dragon +# AI-in-the-loop workflow with Dragon ## Introduction This is an example of how Dragon can be used to execute an AI-in-the-loop workflow. Inspiration for this demo comes from the NERSC-10 Workflow Archetypes White Paper. This workflow most closely resembles the workflow scenario given as part of archetype four. In this example we use a small model implemented in PyTorch to compute an approximation to sin(x). In parallel to doing the inference with the model, we launch `sim-cheap` on four ranks. This MPI job computes the taylor approximation to sin(x) and compares this with the output of the model. If the difference is less than 0.05 we consider the model's approximation to be sufficiently accurate and print out the result with the exact result. If the difference is larger than 0.05 we consider this a failure and re-train the model on a new set of data. To generate this data we launch `sim-expensive`. This MPI job is launched on eight ranks-per-node and each rank generates 32 data points of the form (x, sin(x)) where x is sampled uniformly in [-pi, pi). This data is aggregated into a PyTorch tensor and then used to train the model. We then re-evaluate the re-trained model and decide if we need to re-train again or if the estimate is sufficiently accurate. We continue this loop until we've had five successes. @@ -8,29 +8,29 @@ Below is a diagram of the main computational loop. 
⬇ Parallel Execution ⬅ Re-train the AI Model ⬇ ⬇ - Infer Calculate -value from comparison + Infer Calculate +value from comparison AI Model using four ⬆ rank MPI job ⬇ ⬇ Parallel Execution ⬇ Is the inferred No Launch expensive MPI process - value within ⮕ to generate new data + value within ⮕ to generate new data tolerance? ⬇ Yes -``` +``` ## Usage -`ai-in-the-loop.py` - This is the main file. It contains functions for launching both MPI executables and parsing the results as well as imports functions defined in `model.py` and coordinates the model inference and training with the MPI jobs. +`ai-in-the-loop.py` - This is the main file. It contains functions for launching both MPI executables and parsing their results; it also imports functions defined in `model.py` and coordinates the model inference and training with the MPI jobs. -`model.py` - This file defines the model and provides some functions for model training and inference. +`model.py` - This file defines the model and provides some functions for model training and inference. `sim-expensive.c` - This contains what we are considering the expensive MPI job. It computes (x, sin(x)) data points that are used to train the model. -`sim-cheap.c` - This is the cheap approximation. It computes the Taylor approximation of sin(x). +`sim-cheap.c` - This is the cheap approximation. It computes the Taylor approximation of sin(x). `Makefile` - Used to build the two MPI applications. @@ -40,12 +40,13 @@ value from comparison usage: dragon ai-in-the-loop.py ``` -## Installation +## Installation -After installing dragon, the only other dependency is on PyTorch. The PyTorch version and corresponding pip command can be found here (https://pytorch.org/get-started/locally/). +After installing dragon, the only other dependencies are PyTorch and SciPy. The PyTorch version and corresponding pip command can be found here (https://pytorch.org/get-started/locally/). ``` -> pip install torch torchvision torchaudio +> pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu +> pip install scipy ``` ### Description of the system used For this example, HPE Cray Hotlum nodes were used. Each node has AMD EPYC 7763 6 ### Multi-node -The default parameters are for 16 nodes but this example has been run up to 64 nodes with 8 ranks-per-node. +The default parameters are for 16 nodes, but this example has been run on up to 64 nodes with 8 ranks-per-node. A sketch of the launch pattern used for the MPI jobs is shown below.
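+Below is a minimal sketch of that launch pattern. It is not code taken from `ai-in-the-loop.py`; it assumes Dragon's `dragon.native` `ProcessGroup` API and uses the `sim-cheap` binary built by the Makefile above:
+
+```
+from dragon.native.process import ProcessTemplate
+from dragon.native.process_group import ProcessGroup
+
+# Launch the cheap Taylor-approximation job on four ranks with PMI enabled
+grp = ProcessGroup(restart=False, pmi_enabled=True)
+grp.add_process(nproc=4, template=ProcessTemplate(target="./sim-cheap", args=()))
+grp.init()
+grp.start()
+grp.join()  # wait for all MPI ranks to exit
+grp.stop()
+```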
``` > make gcc -g -pedantic -Wall -I /opt/cray/pe/mpich/8.1.26/ofi/gnu/9.1/include -L /opt/cray/pe/mpich/8.1.26/ofi/gnu/9.1/lib -c -o sim-cheap.o sim-cheap.c gcc -g -pedantic -Wall -I /opt/cray/pe/mpich/8.1.26/ofi/gnu/9.1/include -L /opt/cray/pe/mpich/8.1.26/ofi/gnu/9.1/lib sim-cheap.o -o sim-cheap -lm -L /opt/cray/pe/mpich/8.1.26/ofi/gnu/9.1/lib -lmpich -gcc -g -pedantic -Wall -I /opt/cray/pe/mpich/8.1.26/ofi/gnu/9.1/include -L /opt/cray/pe/mpich/8.1.26/ofi/gnu/9.1/lib -c -o sim-expensive.o +gcc -g -pedantic -Wall -I /opt/cray/pe/mpich/8.1.26/ofi/gnu/9.1/include -L /opt/cray/pe/mpich/8.1.26/ofi/gnu/9.1/lib -c -o sim-expensive.o gcc -g -pedantic -Wall -I /opt/cray/pe/mpich/8.1.26/ofi/gnu/9.1/include -L /opt/cray/pe/mpich/8.1.26/ofi/gnu/9.1/lib sim-expensive.o -o sim-expensive -lm -L /opt/cray/pe/mpich/8.1.26/ofi/gnu/9.1/lib -lmpich > salloc --nodes=16 --exclusive > dragon ai-in-the-loop.py diff --git a/examples/workflows/parsl/README.md b/examples/workflows/parsl/README.md index dd9126e..811ae30 100755 --- a/examples/workflows/parsl/README.md +++ b/examples/workflows/parsl/README.md @@ -1,13 +1,13 @@ -# AI-in-the-loop workflow with Dragon +# Parsl MPI application with Dragon ## Introduction This shows an example workflow using Parsl with Dragon. In this example we use a Dragon implementation of the `@mpi_app` decorator and the `DragonMPIExecutor`. The executor expects five arguments to be returned from the decorated function: the executable, the directory containing the executable, the policy for process placement, the number of MPI processes to launch, and the arguments to pass to the executable. The arguments are expected to be returned in this order. The executor returns a future whose result is a dictionary containing connections to stdin and stdout of rank 0. -In this example we compute the factorial of the largest MPI rank. We multiply this factorial by a scale factor that is sent using the stdin connection and add a bias to the scaled factorial that is passed to the MPI app via the args. The result is printed out by rank 0 and received by the head process from the stdout connection. This result is printed out and compared to the expected exact solution. +In this example we compute the factorial of the largest MPI rank. We multiply this factorial by a scale factor that is sent using the stdin connection and add a bias to the scaled factorial that is passed to the MPI app via the args. The result is printed out by rank 0 and received by the head process from the stdout connection. This result is printed out and compared to the expected exact solution. ## Usage -`parsl_mpi_app_demo.py` - This is the main file. It contains the `@mpi_app` decorated function with the required return arguments for that function. It also has the two functions used for sending data to and receiving data from stdin and stdout, respectively. +`parsl_mpi_app_demo.py` - This is the main file. It contains the `@mpi_app` decorated function with the required return arguments for that function. It also has the two functions used for sending data to and receiving data from stdin and stdout, respectively. `factorial.c` - This contains the MPI application that computes the factorial, scales it by the scale factor received from the stdin connection, and then adds the bias from the args to it. @@ -17,12 +17,12 @@ In this example we compute the factorial of the largest MPI rank. 
We multiply th usage: dragon parsl_mpi_app_demo.py ``` -## Installation +## Installation After installing dragon, the only other dependency is on Parsl. The command to install Parsl is ``` -> pip install parsl +> pip install parsl ``` ## Example Output @@ -35,5 +35,5 @@ gcc -g -pedantic -Wall -I /opt/cray/pe/mpich/8.1.27/ofi/gnu/9.1/include -L /opt gcc -g -pedantic -Wall -I /opt/cray/pe/mpich/8.1.27/ofi/gnu/9.1/include -L /opt/cray/pe/mpich/8.1.27/ofi/gnu/9.1/lib factorial.o -o factorial -lm -L /opt/cray/pe/mpich/8.1.27/ofi/gnu/9.1/lib -lmpich > salloc --nodes=2 --exclusive >$dragon dragon parsl_mpi_app_demo.py -mpi computation: 0.000100 * 362880.000000 + 10.000000 = 46.288000 , exact = 46.288000000000004 +mpi computation: 0.000100 * 362880.000000 + 10.000000 = 46.288000 , exact = 46.288000000000004 ``` diff --git a/src/dragon/__init__.py b/src/dragon/__init__.py index 1caf508..94b8fbf 100644 --- a/src/dragon/__init__.py +++ b/src/dragon/__init__.py @@ -38,7 +38,6 @@ class Foo(multiprocessing.synchronize.Event): import os from distutils.util import strtobool - def _patch_multiprocessing(): # Set DRAGON_PATCH_MP so multiprocessing is automatically patched when # imported in a subprocess. @@ -47,6 +46,16 @@ def _patch_multiprocessing(): from .mpbridge.monkeypatching import patch_multiprocessing patch_multiprocessing() +def _patch_torch(): + + from .ai.torch.monkeypatching import patch_torch + patch_torch() + + from .ai.torch.dataloader_monkeypatch import patch_mpdataloader_torch + patch_mpdataloader_torch() if bool(strtobool(os.environ.get("DRAGON_PATCH_MP", "False"))): _patch_multiprocessing() + +if bool(strtobool(os.environ.get("DRAGON_PATCH_TORCH", "False"))): + _patch_torch() diff --git a/src/dragon/ai/__init__.py b/src/dragon/ai/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/dragon/ai/torch/__init__.py b/src/dragon/ai/torch/__init__.py new file mode 100644 index 0000000..9124c50 --- /dev/null +++ b/src/dragon/ai/torch/__init__.py @@ -0,0 +1,6 @@ + +from .monkeypatching import patch_torch +patch_torch() + +from .dataloader_monkeypatch import patch_mpdataloader_torch +patch_mpdataloader_torch() diff --git a/src/dragon/ai/torch/dataloader_monkeypatch.py b/src/dragon/ai/torch/dataloader_monkeypatch.py new file mode 100644 index 0000000..1dea872 --- /dev/null +++ b/src/dragon/ai/torch/dataloader_monkeypatch.py @@ -0,0 +1,43 @@ +"""A monkeypatch for the PyTorch multiprocessing DataLoader, which does not work with Dragon unpatched. +""" +import dragon +import os +from dragon.infrastructure.policy import Policy +import dragon.ai.torch +import torch +import socket + + +class _DragonMultiProcessingDataLoaderIter(torch.utils.data.dataloader._MultiProcessingDataLoaderIter): + """Class managing the Dragon Multiprocessing DataLoader monkeypatching functionality. + + The worker processes are run on the same node as the loader by applying a Dragon Policy. PyTorch's _MultiProcessingDataLoaderIter is monkeypatched with this Dragon-aware subclass; the original class is saved so that it can be restored via switch_in(). 
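+
+    A minimal usage sketch (hypothetical; it assumes the patch has been applied by
+    importing dragon.ai.torch). This iterator is created automatically when a
+    DataLoader with num_workers > 0 is iterated:
+
+        import dragon.ai.torch                   # applies this monkeypatch
+        from torch.utils.data import DataLoader
+
+        loader = DataLoader(dataset, batch_size=32, num_workers=2)
+        for data, label in loader:               # workers start as Dragon processes on this node
+            ...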
+ """ + + def __init__(self, loader, *args, **kwargs): + with Policy(placement=Policy.Placement.HOST_NAME, host_name=socket.gethostname()): + super().__init__(loader, **kwargs) + + +class DataloaderIterMonkeypatcher: + def __init__(self): + """Save the original torch.utils.data.dataloader._MultiProcessingDataLoaderIter""" + self._DataloaderIter = torch.utils.data.dataloader._MultiProcessingDataLoaderIter + + def switch_out(self) -> None: + """Switch out the Multiprocessing DataLoader iterator for the Dragon version""" + torch.utils.data.dataloader._MultiProcessingDataLoaderIter = _DragonMultiProcessingDataLoaderIter + + def switch_in(self) -> None: + """Switch out the Dragon Multiprocessing DataLoader iterator, restoring the original""" + torch.utils.data.dataloader._MultiProcessingDataLoaderIter = self._DataloaderIter + + +original_dataloader_iter = DataloaderIterMonkeypatcher() + + +def patch_mpdataloader_torch(): + # set env variable so that subprocesses get patched torch + os.environ["DRAGON_PATCH_TORCH"] = str(True) + # patch torch multiprocessing by switching out the Multiprocessing DataLoader iterator for the Dragon version + original_dataloader_iter.switch_out() diff --git a/src/dragon/ai/torch/dictdataset.py b/src/dragon/ai/torch/dictdataset.py new file mode 100644 index 0000000..0818732 --- /dev/null +++ b/src/dragon/ai/torch/dictdataset.py @@ -0,0 +1,58 @@ +from dragon.data.distdictionary.dragon_dict import DragonDict +from torch.utils.data import Dataset +from collections.abc import Iterable +from typing import Any, Union +label = Any +data = Any +DataPair = tuple[data, label] +import warnings + +class DragonDataset(Dataset): + """ + This is a PyTorch dataset that utilizes the dragon distributed dictionary to store the training data and labels. It takes either an iterable for the data or an existing dragon distributed dictionary with a list of its keys. The PyTorch DataLoader requires three functions to be supported: `__getitem__`, `__len__`, and `__init__`. For use with an arbitrary iterable, a `stop()` function is also provided that closes the dragon distributed dictionary. If the user provides a dictionary, the user is expected to manage the dictionary and close it directly. + + :param Dataset: Base PyTorch Dataset + :type Dataset: PyTorch Dataset + """ + def __init__(self, dataset: Union[Iterable[DataPair], DragonDict], *, dataset_keys=None, dragon_dict_args=None): + if dragon_dict_args is None and dataset_keys is not None: + # dictionary is managed elsewhere + self._manage_dict = False + self.dict = dataset + self.keys = dataset_keys + else: + # dataset manages dictionary + self._manage_dict = True + self.dict, self.keys = self._build_dict(dataset, dragon_dict_args) + + def __len__(self): + return len(self.dict) + + def __getitem__(self, idx): + """ Gets a data and label pair from the distributed dictionary based on an idx in [0, len(self.dict)). It retrieves the key self.keys[idx]. 
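+
+        For example (hypothetical usage; `pairs` is any iterable of (data, label)
+        tuples and `args` holds the dragon dict arguments):
+
+            ds = DragonDataset(pairs, dragon_dict_args=args)
+            data, label = ds[0]   # returns the pair stored under ds.keys[0]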
+ + :param idx: A randomly generated index to the list of keys + :type idx: int + :return: Tuple of the data and label with key self.keys[idx] + :rtype: tuple + """ + data, label = self.dict[self.keys[idx]] + return data, label + + def _build_dict(self, dataset: Iterable[DataPair], dragon_dict_args): + data_dict = DragonDict(dragon_dict_args.managers_per_node, dragon_dict_args.n_nodes, dragon_dict_args.total_mem) + + # iterate through the dataset and put each sample in the dictionary + # this can be done with multiple workers if the dataset is large + keys= [0]*len(dataset) + for i, (data, label) in enumerate(dataset): + keys[i] = i + data_dict[i] = (data, label) + + return data_dict, keys + + def stop(self): + if self._manage_dict: + self.dict.stop() + else: + warnings.warn('Dragon dataset dictionary is user-managed. DragonDataset.stop() was ignored.') diff --git a/src/dragon/ai/torch/monkeypatching.py b/src/dragon/ai/torch/monkeypatching.py new file mode 100644 index 0000000..51934a2 --- /dev/null +++ b/src/dragon/ai/torch/monkeypatching.py @@ -0,0 +1,45 @@ +"""A class to monkeypatch PyTorch Multiprocessing reductions that do not work with Dragon. + +PyTorch's multiprocessing introduces specialized reductions for PyTorch data objects like tensors. These reductions utilize shared memory and duplicate file-descriptors. This is incompatible with Dragon. This monkeypatch disables the ability to register these reductions with the ForkingPickler, imports torch, and then returns the ability to register reductions with the ForkingPickler. +""" +import dragon +import os +from multiprocessing.reduction import ForkingPickler + +def dragon_fp_register(cls, type, reduce): + return + +class FPregisterMonkeypatcher: + """Class managing the ForkingPickler monkeypatching functionality. + + We temporarily replace the ForkingPickler.register function to be a no-op. We save the original ForkingPickler.register function so that we can switch it back in once PyTorch has been imported. + """ + + def __init__(self): + """Save original ForkingPickler.register function""" + self.fp_register = ForkingPickler.register + + def switch_out(self) -> None: + """Switch out ForkingPickler.register function for no-op""" + ForkingPickler.register = classmethod(dragon_fp_register) + + def switch_in(self) -> None: + """Switch back in ForkingPickler.register function with original""" + ForkingPickler.register = self.fp_register + +original_fp_register = FPregisterMonkeypatcher() + +def patch_torch(): + """ This is called when dragon.ai.torch is imported. This needs to be done before torch is imported. The environment variable ensures that all subprocesses also patch torch. + + The mechanics of the patch are as follows: + 1. Make ForkingPickler.register a no-op so that reductions registered with the ForkingPickler are ignored. This function is used in torch.multiprocessing.reductions.init_reductions to register the specialized reductions. + 2. Import PyTorch while the ForkingPickler.register function is a no-op + 3. Switch back in the ForkingPickler.register's original functionality so that downstream functions are actually registered. 
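+
+    A minimal sketch of the intended import order (hypothetical user code):
+
+        import dragon.ai.torch   # runs patch_torch() before torch is first imported
+        import torch             # torch's shared-memory tensor reductions were never registered
+
+    Importing torch before dragon.ai.torch would defeat the patch, since the reductions would already be registered by then.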
+ """ + # set env variable so that subprocesses get patched torch + os.environ["DRAGON_PATCH_TORCH"] = str(True) + # patch torch multiprocessing by making ForkingPickler.register a no-op, importing torch, and then returning the ForkingPickler.register to its original functionality + original_fp_register.switch_out() + import torch + original_fp_register.switch_in() \ No newline at end of file diff --git a/src/dragon/cli/__init__.py b/src/dragon/cli/__init__.py index baee35c..0624309 100644 --- a/src/dragon/cli/__init__.py +++ b/src/dragon/cli/__init__.py @@ -23,7 +23,7 @@ def _from_text(text): PROCNAME_NETWORK_CONFIG = 'dragon-network-config' PROCNAME_NETWORK_CONFIG_LAUNCH_HELPER = 'dragon-network-config-launch-helper' PROCNAME_NETWORK_CONFIG_SHUTDOWN_HELPER = 'dragon-network-config-shutdown-helper' -PROCNAME_NETWORK_IFADDRS='dragon-network-ifaddrs' +PROCNAME_NETWORK_IFADDRS = 'dragon-network-ifaddrs' # TODO Refactor frontend entry point. See ../__main__.py and # TODO ../launcher/launch_selector.py. diff --git a/src/dragon/dtypes_inc.pxd b/src/dragon/dtypes_inc.pxd index b7239c2..be58526 100644 --- a/src/dragon/dtypes_inc.pxd +++ b/src/dragon/dtypes_inc.pxd @@ -496,9 +496,13 @@ cdef extern from "": ctypedef struct dragonFIFOLiteLock_t: pass - ctypedef struct dragonLock_t: + ctypedef struct dragonLock_u: pass + ctypedef struct dragonLock_t: + dragonLockKind_t kind + dragonLock_u ptr + size_t dragon_lock_size(dragonLockKind_t kind) dragonError_t dragon_lock_init(dragonLock_t * lock, void * ptr, dragonLockKind_t lock_kind) dragonError_t dragon_lock_attach(dragonLock_t * lock, void * ptr) @@ -572,3 +576,58 @@ cdef extern from "dragon/perf.h": dragonError_t dragon_chperf_kernel_append_op(int kernel_idx, dragonChPerfOpcode_t op_code, int dst_ch_idx, size_t size_in_bytes, double timeout_in_sec) dragonError_t dragon_chperf_kernel_run(int kernel_idx, double *run_time) +cdef extern from "dragon/fli.h": + + ctypedef struct dragonFLIAttr_t: + pass + + ctypedef struct dragonFLIDescr_t: + pass + + ctypedef struct dragonFLISerial_t: + size_t len + uint8_t * data + + ctypedef struct dragonFLISendHandleDescr_t: + pass + + ctypedef struct dragonFLIRecvHandleDescr_t: + pass + + dragonChannelDescr_t* STREAM_CHANNEL_IS_MAIN_FOR_1_1_CONNECTION + + dragonError_t dragon_fli_create(dragonFLIDescr_t* adapter, dragonChannelDescr_t* main_ch, + dragonChannelDescr_t* mgr_ch, dragonMemoryPoolDescr_t* pool, + const dragonULInt num_strm_chs, dragonChannelDescr_t** strm_channels, + const bool user_buffered_protocol, dragonFLIAttr_t* attrs) nogil + dragonError_t dragon_fli_destroy(dragonFLIDescr_t* adapter) nogil + dragonError_t dragon_fli_serialize(const dragonFLIDescr_t* adapter, dragonFLISerial_t* serial) nogil + dragonError_t dragon_fli_serial_free(dragonFLISerial_t* serial) nogil + dragonError_t dragon_fli_attach(const dragonFLISerial_t* serial, const dragonMemoryPoolDescr_t* pool, + dragonFLIDescr_t* adapter) nogil + dragonError_t dragon_fli_detach(dragonFLIDescr_t* adapter) nogil + dragonError_t dragon_fli_open_send_handle(const dragonFLIDescr_t* adapter, dragonFLISendHandleDescr_t* send_handle, + dragonChannelDescr_t* strm_ch, const timespec_t* timeout) nogil + dragonError_t dragon_fli_close_send_handle(dragonFLISendHandleDescr_t* send_handle, + const timespec_t* timeout) nogil + dragonError_t dragon_fli_open_recv_handle(const dragonFLIDescr_t* adapter, dragonFLIRecvHandleDescr_t* recv_handle, + dragonChannelDescr_t* strm_ch, const timespec_t* timeout) nogil + dragonError_t 
dragon_fli_close_recv_handle(dragonFLIRecvHandleDescr_t* recv_handle, const timespec_t* timeout) nogil + dragonError_t dragon_fli_create_writable_fd(dragonFLISendHandleDescr_t* send_handle, int* fd_ptr, const bool buffer, + size_t chunk_size, const uint64_t arg, const timespec_t* timeout) nogil + dragonError_t dragon_fli_finalize_writable_fd(dragonFLISendHandleDescr_t* send_handle) nogil + dragonError_t dragon_fli_create_readable_fd(dragonFLIRecvHandleDescr_t* recv_handle, int* fd_ptr, + const timespec_t* timeout) nogil + dragonError_t dragon_fli_finalize_readable_fd(dragonFLIRecvHandleDescr_t* recv_handle) nogil + dragonError_t dragon_fli_send_bytes(dragonFLISendHandleDescr_t* send_handle, size_t num_bytes, + uint8_t* bytes, uint64_t arg, const bool buffer, const timespec_t* timeout) nogil + dragonError_t dragon_fli_send_mem(dragonFLISendHandleDescr_t* send_handle, dragonMemoryDescr_t* mem, + uint64_t arg, const timespec_t* timeout) nogil + dragonError_t dragon_fli_recv_bytes(dragonFLIRecvHandleDescr_t* recv_handle, size_t requested_size, + size_t* received_size, uint8_t** bytes, uint64_t* arg, + const timespec_t* timeout) nogil + dragonError_t dragon_fli_recv_bytes_into(dragonFLIRecvHandleDescr_t* recv_handle, size_t requested_size, + size_t* received_size, uint8_t* bytes, uint64_t* arg, + const timespec_t* timeout) nogil + dragonError_t dragon_fli_recv_mem(dragonFLIRecvHandleDescr_t* recv_handle, dragonMemoryDescr_t* mem, + uint64_t* arg, const timespec_t* timeout) nogil diff --git a/src/dragon/globalservices/policy_eval.py b/src/dragon/globalservices/policy_eval.py index 0563049..dbcd559 100644 --- a/src/dragon/globalservices/policy_eval.py +++ b/src/dragon/globalservices/policy_eval.py @@ -8,8 +8,9 @@ class ResourceLayout: h_uid : int host_name : str numa_node : int # TODO - core : [int] # List of acceptable CPU cores the layout can be applied to - accelerator : int # TODO + cpu_core : [int] # List of acceptable CPU cores the layout can be applied to + gpu_core : [int] + accelerator_env : str # Possible TODO: Affinity offset or placement offset, to "start counting from here", just make sure it starts at a certain place @@ -50,7 +51,7 @@ def _find_next_open(self, cur_idx): return node - def _add_layout(self, node, affinity, layouts): + def _add_layout(self, node, cpu_affinity, gpu_affinity, env_str, layouts): """ Short helper function to auto-increment num_policies value on nodes @@ -60,7 +61,9 @@ def _add_layout(self, node, affinity, layouts): """ # NOTE: Numa node and accelerator are placeholders - layouts.append( ResourceLayout(node.h_uid, node.host_name, 1, affinity, 0) ) + numa_node = 0 + layouts.append( ResourceLayout(node.h_uid, node.host_name, numa_node, + cpu_affinity, gpu_affinity, env_str) ) node.num_policies += 1 def _node_by_id(self, host_id) -> NodeDescriptor: @@ -140,31 +143,28 @@ def _get_node(self, p : Policy) -> NodeDescriptor: return node - def _get_affinity(self, p : Policy, node : NodeDescriptor) -> list[int]: + def _get_cpu_affinity(self, p : Policy, node : NodeDescriptor) -> list[int]: """ Generate a list of available devices the policy can be applied to for the given Node """ - affinity = p.affinity - if affinity == Policy.Affinity.DEFAULT: - affinity = Policy.Affinity.ANY + if p.cpu_affinity: # List not empty, assume SPECIFIC affinity + affinity = [x for x in node.cpu_devices if x in p.cpu_affinity] + return affinity # This covers both "ANY" and "SPECIFIC" if a specific list is given - device = p.device - if device == Policy.Device.DEFAULT: - device = 
Policy.Device.CPU + if p.affinity == Policy.Affinity.ANY: + return node.cpu_devices - if device == Policy.Device.CPU: - if p.specific_affinity: # List not empty, assume SPECIFIC affinity - affinity = [x for x in node.cpu_devices if x in p.specific_affinity] - return affinity # This covers both "ANY" and "SPECIFIC" if a specific list is given + return [] + + def _get_gpu_affinity(self, p : Policy, node : NodeDescriptor) -> list[int]: - if affinity == Policy.Affinity.ANY: - return node.cpu_devices + if p.gpu_affinity: + affinity = [x for x in node.accelerators.device_list if x in p.gpu_affinity] + return affinity - if device == Policy.Device.GPU: - raise RuntimeError("GPU Affinity Not Implemented") - # TODO: Affinity ROUNDROBIN and BLOCK. These are more load balancing options than not, so tracking usage needs to be figured out - # TODO: GPU Affinity. NodeDescriptor needs to pull a list of available accelerators (through an os.exec grep command maybe?) + if p.affinity == Policy.Affinity.ANY and node.accelerators is not None: + return node.accelerators.device_list return [] @@ -180,8 +180,12 @@ def evaluate(self, policies : list[Policy]=None) -> list[ResourceLayout]: # Merge incoming policies against the self.default_policy so any DEFAULT enums get replaced with the default policy option p = self.merge(self.default_policy, p) node = self._get_node(p) # Get a node based on policy (if requesting specific nodes, may raise exception) - affinity = self._get_affinity(p, node) # Get affinity based on policy - self._add_layout(node, affinity, layouts) + cpu_affinity = self._get_cpu_affinity(p, node) # Get affinity based on policy + gpu_affinity = self._get_gpu_affinity(p, node) + env_str = "" # Environment string for setting accelerator affinity + if gpu_affinity: + env_str = node.accelerators.env_str + self._add_layout(node, cpu_affinity, gpu_affinity, env_str, layouts) return layouts diff --git a/src/dragon/globalservices/process.py b/src/dragon/globalservices/process.py index 30c9afb..120abfa 100644 --- a/src/dragon/globalservices/process.py +++ b/src/dragon/globalservices/process.py @@ -21,6 +21,7 @@ from ..infrastructure import facts as dfacts from ..infrastructure import connection as dconn from ..utils import B64 +from ..infrastructure.policy import Policy import os log = logging.getLogger('process_api') @@ -180,7 +181,7 @@ def _create_stdio_connections(the_desc): def get_create_message(exe, run_dir, args, env, user_name='', options=None, stdin=None, stdout=None, stderr=None, group=None, - user=None, umask=- 1, pipesize=- 1, pmi_required=False): + user=None, umask=- 1, pipesize=- 1, pmi_required=False, policy=None): """Return a GSProcessCreate object. :param exe: executable to run @@ -197,6 +198,7 @@ def get_create_message(exe, run_dir, args, env, user_name='', options=None, :param umask: Not used :param pipesize: Set the channel capacity. Default = -1. :param pmi_required: This process is part of a Dragon managed MPI/PMI application group. + :param policy: If a policy other than the global default is to be used for this process. :return: GSProcessCreate message object """ @@ -206,6 +208,12 @@ def get_create_message(exe, run_dir, args, env, user_name='', options=None, if env is not None: the_env.update(env) # layer on the user supplied environment. 
+ # Ensure that the executable is a string and not a bytes object + try: + exe = exe.decode() + except (UnicodeDecodeError, AttributeError): + pass + log.debug('creating GSProcessCreate') return dmsg.GSProcessCreate(tag=das.next_tag(), p_uid=this_process.my_puid, @@ -214,7 +222,8 @@ def get_create_message(exe, run_dir, args, env, user_name='', options=None, rundir=run_dir, user_name=user_name, options=options, stdin=stdin, stdout=stdout, stderr=stderr, group=group, user=user, umask=umask, - pipesize=pipesize, pmi_required=pmi_required) + pipesize=pipesize, pmi_required=pmi_required, + policy=policy) def get_create_message_with_argdata(exe, run_dir, args, env, argdata=None, user_name='', options=None, @@ -255,6 +264,12 @@ def get_create_message_with_argdata(exe, run_dir, args, env, argdata=None, user_ if env is not None: the_env.update(env) # layer on the user supplied environment. + # Ensure that the executable is a string and not a bytes object + try: + exe = exe.decode() + except (UnicodeDecodeError, AttributeError): + pass + log.debug('creating GSProcessCreate') if argdata is None: @@ -290,7 +305,7 @@ def get_create_message_with_argdata(exe, run_dir, args, env, argdata=None, user_ def create(exe, run_dir, args, env, user_name='', options=None, soft=False, stdin=None, stdout=None, stderr=None, group=None, - user=None, umask=- 1, pipesize=- 1, pmi_required=False): + user=None, umask=- 1, pipesize=- 1, pmi_required=False, policy=None): """Asks Global Services to create a new process. :param exe: executable to run @@ -308,9 +323,13 @@ def create(exe, run_dir, args, env, user_name='', options=None, soft=False, :param umask: Not used :param pipesize: Set the channel capacity. Default = -1. :param pmi_required: This process is part of a Dragon managed MPI/PMI application group. + :param policy: If a policy other than the global default is to be used for this process. :return: ProcessDescriptor object """ + if policy is None: + policy = Policy.global_policy() + # When a new process is requested we want to check to see if we are supposed to # redirect output from new processes to the parent process (this process). If so, we # set that up here and then remove that indicator from the env vars. 
This @@ -343,6 +362,7 @@ def create(exe, run_dir, args, env, user_name='', options=None, soft=False, umask=umask, pipesize=pipesize, pmi_required=pmi_required, + policy=policy, ) reply_msg = das.gs_request(req_msg) diff --git a/src/dragon/globalservices/process_int.py b/src/dragon/globalservices/process_int.py index 6bd1949..9167a44 100644 --- a/src/dragon/globalservices/process_int.py +++ b/src/dragon/globalservices/process_int.py @@ -16,6 +16,7 @@ from ..infrastructure import connection as dconn from ..infrastructure import parameters as dp from ..infrastructure import util as dutil +from ..infrastructure import policy as dpolicy from ..utils import B64 LOG = logging.getLogger('process:') @@ -136,7 +137,8 @@ def _mk_sh_proc_create(self, the_tag, which_node): stdin_msg=stdin_msg, stdout_msg=stdout_msg, stderr_msg=stderr_msg, - pmi_info=self.request._pmi_info) #pylint: disable=protected-access + pmi_info=self.request._pmi_info, + layout=self.request.layout) #pylint: disable=protected-access def mk_sh_proc_kill(self, the_tag, the_sig=signal.SIGKILL): return dmsg.SHProcessKill(tag=the_tag, @@ -190,6 +192,10 @@ def construct(cls, server, msg, reply_channel, head, send_msg=True, belongs_to_g if not msg.user_name: msg.user_name = auto_name + # If a policy was passed through but has not been evaluated into a layout, do so now + if msg.layout is None and msg.policy is not None: + msg.layout = server.policy_eval.evaluate([msg.policy])[0] + which_node = server.choose_shepherd(msg) context = cls(server=server, request=msg, reply_channel=reply_channel, diff --git a/src/dragon/infrastructure/facts.py b/src/dragon/infrastructure/facts.py index f8eeaae..f022181 100644 --- a/src/dragon/infrastructure/facts.py +++ b/src/dragon/infrastructure/facts.py @@ -1,5 +1,6 @@ """Specified constants and names used in the Dragon runtime. """ +import os import shlex import enum import sys @@ -10,7 +11,14 @@ # Number of gateway channels per node DRAGON_DEFAULT_NUM_GW_CHANNELS_PER_NODE = 0 DRAGON_OVERLAY_DEFAULT_NUM_GW_CHANNELS_PER_NODE = 1 + +# These three constants are repeated within C code. Any change +# here requires a change in globals.h as well. NUM_GW_CHANNELS_PER_NODE_VAR = 'NUM_GW_CHANNELS_PER_NODE' +DEFAULT_PD_VAR = 'DEFAULT_PD' +INF_PD_VAR = 'INF_PD' + +NUM_GW_TYPES = 3 # needed for naming convention for the gateway channels # should be unique on a given node, not globally # Note: there is also a dependency on dragon_channel_register_gateways_from_env() @@ -18,11 +26,11 @@ # For environment variable passing, this set is the list of dragon parameters # in capital letters. 
-env_vars = frozenset({'MODE', 'INDEX', 'DEFAULT_PD', 'INF_PD', 'LOCAL_SHEP_CD', 'LOCAL_BE_CD', 'GS_RET_CD', +env_vars = frozenset({'MODE', 'INDEX', DEFAULT_PD_VAR, INF_PD_VAR, 'LOCAL_SHEP_CD', 'LOCAL_BE_CD', 'GS_RET_CD', 'SHEP_RET_CD', 'GS_CD', 'DEFAULT_SEG_SZ', 'INF_SEG_SZ', 'TEST', 'DEBUG', 'MY_PUID', 'BE_CUID', 'INF_WAIT_MODE', 'USER_WAIT_MODE', 'USER_RETURN_WHEN_MODE', 'INF_RETURN_WHEN_MODE', 'GW_CAPACITY', NUM_GW_CHANNELS_PER_NODE_VAR, 'TRANSPORT_AGENT', 'HSTA_MAX_EJECTION_MB', 'HSTA_MAX_GETMSG_MB', - 'PMOD_COMMUNICATION_TIMEOUT', 'BASEPOOL'}) + 'PMOD_COMMUNICATION_TIMEOUT', 'BASEPOOL', 'OVERLAY_FANOUT', 'NET_CONF_CACHE'}) # TODO:PE-37770 This list of names should NOT need to appear @@ -614,3 +622,5 @@ def from_str(s): DEFAULT_PORT_RANGE=1000 # GS_DEFAULT_POLICY -- To prevent circular imports, this lives in policy_eval.py + +DEFAULT_NET_CONF_CACHE = os.path.join(os.getcwd(), ".dragon-net-conf") diff --git a/src/dragon/infrastructure/gpu_desc.py b/src/dragon/infrastructure/gpu_desc.py new file mode 100644 index 0000000..90809cb --- /dev/null +++ b/src/dragon/infrastructure/gpu_desc.py @@ -0,0 +1,92 @@ +import re +import enum +from dataclasses import dataclass, field +from subprocess import check_output, CalledProcessError + +GENERIC_REGEX = "((3d|display|vga) (controller|connector)?): (NVIDIA|AMD|Advanced Micro Devices)" + + +class AccVendor(enum.IntEnum): + NVIDIA = enum.auto() + AMD = enum.auto() + UNKNOWN = enum.auto() + + +class AccEnvStr(): + NVIDIA = "CUDA_VISIBLE_DEVICES" + AMD = "ROCR_VISIBLE_DEVICES" + HIP = "HIP_VISIBLE_DEVICES" + + +@dataclass +class AcceleratorDescriptor: + vendor: AccVendor = AccVendor.UNKNOWN + device_list: list[int] = field(default_factory=list) # Currently just returns a list of ints that correspond to device ID (0, 1, 2, etc) + env_str: str = "" + + def get_sdict(self): + rv = { + "vendor": self.vendor, + "device_list": self.device_list, + "env_str": self.env_str + } + return rv + + @classmethod + def from_sdict(cls, sdict): + return AcceleratorDescriptor(**sdict) + + +def find_nvidia(): + try: + # nvidia-smi -L prints one line per device + output = check_output(["nvidia-smi", "-L"]).decode('utf-8').splitlines() + return output + except (FileNotFoundError, CalledProcessError): + return None + + +def find_amd(): + # rocm-smi works similar to nvidia-smi but currently there is no clean "list devices" param + # lspci is easier to parse + return None + + +def find_accelerators() -> AcceleratorDescriptor: + devices = find_nvidia() + if devices is not None: + acc = AcceleratorDescriptor(vendor=AccVendor.NVIDIA, + device_list=list(range(len(devices))), + env_str=AccEnvStr.NVIDIA + ) + return acc + + devices = find_amd() + if devices is not None: + return None # Not implemented, see find_amd() + + try: + output = check_output(["lspci"]).decode('utf-8').splitlines() + except FileNotFoundError: + # print("LSPCI not installed") # TODO: This needs to be sent to a logger somewhere + return None + + # NOTE: Will not work as expected with heterogeneous setups (e.g. 
mixed Nvidia/AMD cards on one node) + devices = AcceleratorDescriptor() + n_devices = 0 + for line in output: + m = re.search(GENERIC_REGEX, line, re.IGNORECASE) + if m: + if m[4] == "NVIDIA": + devices.vendor = AccVendor.NVIDIA + devices.env_str = AccEnvStr.NVIDIA + elif m[4] == "AMD" or m[4] == "Advanced Micro Devices": + devices.vendor = AccVendor.AMD + devices.env_str = AccEnvStr.AMD + n_devices += 1 + + if n_devices > 0: + devices.device_list = list(range(n_devices)) + else: + return None + + return devices diff --git a/src/dragon/infrastructure/messages.py b/src/dragon/infrastructure/messages.py index 9ac178e..820eab0 100644 --- a/src/dragon/infrastructure/messages.py +++ b/src/dragon/infrastructure/messages.py @@ -6,12 +6,12 @@ import zlib import base64 import subprocess - +from typing import Optional, Union from dataclasses import dataclass, asdict from ..infrastructure import channel_desc from ..infrastructure import pool_desc -from ..infrastructure import node_desc +from ..infrastructure.node_desc import NodeDescriptor from ..infrastructure import process_desc from ..infrastructure import parameters as parms from ..infrastructure import facts as dfacts @@ -170,7 +170,7 @@ class MessageTypes(enum.Enum): GS_GROUP_CREATE_ADD_TO_RESPONSE = 132 #: GS_GROUP_DESTROY_REMOVE_FROM = 133 #: GS_GROUP_DESTROY_REMOVE_FROM_RESPONSE = 134 #: - + HSTA_UPDATE_NODES = 135 #: @enum.unique class FileDescriptor(enum.Enum): @@ -299,9 +299,12 @@ def err(self): def from_sdict(cls, sdict): return cls(**sdict) - def serialize(self): + def uncompressed_serialize(self): return json.dumps(self.get_sdict()) + def serialize(self): + return base64.b64encode(zlib.compress(json.dumps(self.get_sdict()).encode('utf-8'))).decode('ascii') + def __str__(self): cn = self.__class__.__name__ msg = f'{cn}: {self.tag}' @@ -348,7 +351,7 @@ class GSProcessCreate(_MsgBase): def __init__(self, tag, p_uid, r_c_uid, exe, args, env=None, rundir='', user_name='', options=None, stdin=None, stdout=None, stderr=None, group=None, user=None, umask=- 1, pipesize=None, pmi_required=False, - _pmi_info=None, layout=None, _tc=None): + _pmi_info=None, layout=None, policy=None, _tc=None): # Coerce args to a list of strings args = list(to_str_iter(args)) @@ -391,12 +394,21 @@ def __init__(self, tag, p_uid, r_c_uid, exe, args, env=None, rundir='', if layout is None: self.layout = None elif isinstance(layout, dict): - self.layout = ResourceLayout(**_pmi_info) + self.layout = ResourceLayout(**layout) elif isinstance(layout, ResourceLayout): - self.layout = _pmi_info + self.layout = layout else: raise ValueError(f'GS unsupported layout value {layout=}') + if policy is None: + self.policy = None + elif isinstance(policy, dict): + self.policy = Policy(**policy) + elif isinstance(policy, Policy): + self.policy = policy + else: + raise ValueError(f'GS unsupported policy value {policy=}') + @property def options(self): return self._options @@ -430,6 +442,7 @@ def get_sdict(self): rv['pmi_required'] = self.pmi_required rv['_pmi_info'] = None if self._pmi_info is None else asdict(self._pmi_info) rv['layout'] = None if self.layout is None else asdict(self.layout) + rv['policy'] = None if self.policy is None else asdict(self.policy) return rv @@ -2520,10 +2533,10 @@ def desc(self): @desc.setter def desc(self, value): - if isinstance(value, node_desc.NodeDescriptor): + if isinstance(value, NodeDescriptor): self._desc = value else: - self._desc = node_desc.NodeDescriptor.from_sdict(value) + self._desc = NodeDescriptor.from_sdict(value) def get_sdict(self): 
rv = super().get_sdict() @@ -2930,7 +2943,7 @@ def __init__(self, tag, p_uid, r_c_uid, t_p_uid, exe, args, env=None, rundir='', if layout is None: self.layout = None elif isinstance(layout, dict): - self.layout = ResourceLayout(**pmi_info) + self.layout = ResourceLayout(**layout) elif isinstance(layout, ResourceLayout): self.layout = layout else: @@ -3542,23 +3555,22 @@ class SHChannelsUp(_MsgBase): _tc = MessageTypes.SH_CHANNELS_UP - def __init__(self, tag, ip_addrs, host_id, host_name, shep_cd, gs_cd, idx=0, _tc=None): + def __init__(self, tag, node_desc, gs_cd, idx=0, _tc=None): super().__init__(tag) - self.ip_addrs = ip_addrs - self.host_id = host_id - self.host_name = host_name - self.shep_cd = shep_cd + + self.idx = idx + if isinstance(node_desc, dict): + self.node_desc = NodeDescriptor.from_sdict(node_desc) + elif isinstance(node_desc, NodeDescriptor): + self.node_desc = node_desc + # On the primary node the gs_cd is set to the base64 encoded gs channel descriptor. # Otherwise, it is ignored and presumably the empty string. self.gs_cd = gs_cd - self.idx = idx def get_sdict(self): rv = super().get_sdict() - rv['ip_addrs'] = self.ip_addrs - rv['host_id'] = self.host_id - rv['host_name'] = self.host_name - rv['shep_cd'] = self.shep_cd + rv['node_desc'] = self.node_desc.get_sdict() rv['gs_cd'] = self.gs_cd rv['idx'] = self.idx return rv @@ -3785,7 +3797,7 @@ def get_sdict(self): return rv def __str__(self): - return f'{super().__str__()}, self.data={self.data!r}, self.p_uid={self.p_uid!r}, self.fd_num={self.fd_num!r}' + return f'{super().__str__()}, self.data={self.data!r}, self.p_uid={self.p_uid!r}, self.pid={self.pid!r}, self.fd_num={self.fd_num!r}' class SHDumpState(_MsgBase): @@ -4096,31 +4108,15 @@ def __init__(self, tag, nodes_desc, gs_cd, num_gw_channels, port=dfacts.DEFAULT_ self.transport = dfacts.TransportAgentOptions.from_str(transport) self.num_gw_channels = num_gw_channels - msg = "LAChannelsInfo nodes_desc requires list of dmsg.SHChannelsUp or already arranged dict" - if isinstance(nodes_desc, list): - sh_chs = all(isinstance(node_desc, SHChannelsUp) for node_desc in nodes_desc) - if not sh_chs: - raise TypeError(msg) - self.nodes_desc = {node.idx: node_desc.NodeDescriptor(host_name=node.host_name, - name=node.host_name, - host_id=node.host_id, - ip_addrs=node.ip_addrs, - shep_cd=node.shep_cd, - port=port) - for node in nodes_desc} - elif isinstance(nodes_desc, dict): - self.nodes_desc = {key: node_desc.NodeDescriptor(name=nodes_desc[key]['name'] if 'name' in nodes_desc[key] else None, - host_name=nodes_desc[key]['host_name'], - host_id=nodes_desc[key]['host_id'], - ip_addrs=nodes_desc[key]['ip_addrs'], - shep_cd=nodes_desc[key]['shep_cd']) - for key in nodes_desc.keys()} - else: - raise TypeError(msg) + self.nodes_desc = {} + for key in nodes_desc.keys(): + if isinstance(nodes_desc[key], dict): + self.nodes_desc[key] = NodeDescriptor.from_sdict(nodes_desc[key]) + elif isinstance(nodes_desc[key], NodeDescriptor): + self.nodes_desc[key] = nodes_desc[key] - def serialize(self): - j = {'COMPRESSED': base64.b64encode(zlib.compress(json.dumps(self.get_sdict()).encode('utf-8'))).decode('ascii')} - return json.dumps(j) + if port is not None: + self.nodes_desc[key].port = port def get_sdict(self): rv = super().get_sdict() @@ -4340,13 +4336,55 @@ class FENodeIdxBE(_MsgBase): _tc = MessageTypes.FE_NODE_IDX_BE - def __init__(self, tag, node_index, _tc=None): + def __init__(self, + tag, + node_index, + forward: Optional[dict['str', Union[NodeDescriptor, dict]]] = None, + send_desc: 
Optional[Union[B64, str]] = None, + _tc=None): + super().__init__(tag) self.node_index = int(node_index) + self.forward = forward + self.send_desc = send_desc + + @property + def forward(self): + return self._forward + + @forward.setter + def forward(self, value): + try: + self._forward = {} + for idx, node in value.items(): + if isinstance(node, NodeDescriptor): + self._forward[idx] = node + else: + self._forward[idx] = NodeDescriptor.from_sdict(node) + except (TypeError, AttributeError): + self._forward = value + + @property + def send_desc(self): + return self._send_desc + + @send_desc.setter + def send_desc(self, value): + if isinstance(value, str): + self._send_desc = B64.from_str(value) + else: + self._send_desc = value def get_sdict(self): rv = super().get_sdict() rv['node_index'] = self.node_index + try: + rv['forward'] = self.forward.copy() + for idx in self.forward.keys(): + rv['forward'][idx] = self.forward[idx].get_sdict() + except AttributeError: + rv['forward'] = self.forward + rv['send_desc'] = str(self.send_desc) return rv @@ -4482,6 +4520,40 @@ def get_sdict(self): return rv +class TAUpdateNodes(_MsgBase): + """ + Refer to :ref:`definition` and :ref:`Common Fields` for a description of the + message structure. + + """ + _tc = MessageTypes.HSTA_UPDATE_NODES + + def __init__(self, tag, + nodes: list[Union[NodeDescriptor, dict]], + _tc=None): + super().__init__(tag) + self.nodes = nodes + + @property + def nodes(self): + return self._nodes + + @nodes.setter + def nodes(self, value): + self._nodes = [] + for node in value: + if isinstance(node, NodeDescriptor): + self._nodes.append(node) + else: + self._nodes.append(NodeDescriptor.from_sdict(node)) + + def get_sdict(self): + rv = super().get_sdict() + rv['nodes'] = [node.get_sdict() for node in self.nodes] + + return rv + + all_message_classes = [GSProcessCreate, GSProcessCreateResponse, GSProcessList, @@ -4615,23 +4687,25 @@ def get_sdict(self): GSGroupCreateAddTo, GSGroupCreateAddToResponse, GSGroupDestroyRemoveFrom, - GSGroupDestroyRemoveFromResponse] + GSGroupDestroyRemoveFromResponse, + TAUpdateNodes] mt_dispatch = {cls._tc.value: cls for cls in all_message_classes} def parse(jstring, restrict=None): + try: + # if a compressed message, decompress to get the service message + jstring = zlib.decompress(base64.b64decode(jstring)) + except zlib.error as zerr: + pass + try: sdict = json.loads(jstring) except TypeError as e: raise TypeError(f'The message "{jstring}" could not be parsed.') from e - # if a compressed message, decompress to get the service message - if 'COMPRESSED' in sdict: - jstring = zlib.decompress(base64.b64decode(sdict['COMPRESSED'])) - sdict = json.loads(jstring) - typecode = sdict['_tc'] if restrict: diff --git a/src/dragon/infrastructure/node_desc.py b/src/dragon/infrastructure/node_desc.py index d413cd1..40b84d7 100644 --- a/src/dragon/infrastructure/node_desc.py +++ b/src/dragon/infrastructure/node_desc.py @@ -14,6 +14,7 @@ from .facts import DEFAULT_TRANSPORT_NETIF, DEFAULT_OVERLAY_NETWORK_PORT, DEFAULT_PORT_RANGE from ..utils import host_id as get_host_id from .util import port_check +from .gpu_desc import AcceleratorDescriptor, find_accelerators class NodeDescriptor: @@ -47,7 +48,10 @@ def __init__( is_primary: bool = False, host_id: int = None, shep_cd: str = '', - host_name: str = '' + overlay_cd: str = '', + host_name: str = '', + cpu_devices: Optional[list[int]] = None, + accelerators: Optional[AcceleratorDescriptor] = None ): self.h_uid = h_uid self.name = name @@ -58,29 +62,26 @@ def __init__( 
self.host_name = host_name self.shep_cd = shep_cd + self.overlay_cd = overlay_cd self.host_id = host_id + self.cpu_devices = cpu_devices + self.accelerators = accelerators # Not a very accurate measure since we don't know when a policy group is done, # but it gives some manner of tracking for block application # TODO: This might be useful later when we can GC finished policy jobs self.num_policies = 0 - # Get a list of available CPUs on the node - self.cpu_devices = list(os.sched_getaffinity(0)) - - # Get a list of available GPUs on the node - # TODO: Currently all I'm finding is to use a tensorflow library - # Maybe exec a bash command? lspci | grep ' VGA ' - # NOTE: Apparently there is a py-lspci that might do this more easily for us - if port is not None: self.ip_addrs = [f'{ip_addr}:{port}' for ip_addr in ip_addrs] else: self.ip_addrs = ip_addrs - def __str__(self): + def __repr__(self) -> str: return f"name:{self.name}, host_id:{self.host_id} at {self.ip_addrs}, state:{self.state.name}" + def __str__(self): + return f"name:{self.name}, host_id:{self.host_id} at {self.ip_addrs}, state:{self.state.name}" @classmethod def make_for_current_node(cls, name: Optional[str] = None, ip_addrs: Optional[list[str]] = None, is_primary: bool = False): @@ -187,6 +188,47 @@ def get_local_node_network_conf(cls, raise RuntimeError(f'Could not find available port for IP address={ip_addr} in port range {port_range}') + @classmethod + def get_localservices_node_conf(cls, + name: str = "", + host_name: str = '', + host_id: int = None, + ip_addrs: Optional[list[str]] = None, + shep_cd: str = '', + cpu_devices: Optional[list[int]] = None, + accelerators: Optional[AcceleratorDescriptor] = None): + """Return a NodeDescriptor object for Local Services to pass into its SHChannelsUp message + + Populates the values in a NodeDescriptor object that Local Services needs to provide to the + launcher frontend as part of infrastructure bring-up + + :param name: Name for node. 
Often falls back to the hostname, defaults to ""
+        :type name: str, optional
+        :param host_name: Hostname for the node, defaults to ''
+        :type host_name: str, optional
+        :param host_id: unique host ID of this node, defaults to None
+        :type host_id: int, optional
+        :param ip_addrs: IP addresses used for backend messaging by transport agents, defaults to None
+        :type ip_addrs: list[str], optional
+        :param cpu_devices: List of CPU device IDs available on this node, defaults to None
+        :type cpu_devices: list[int], optional
+        :param shep_cd: Channel descriptor for this node's Local Services, defaults to ''
+        :type shep_cd: str, optional
+        :param accelerators: Descriptor of any accelerators available on this node, defaults to None
+        :type accelerators: AcceleratorDescriptor, optional
+        """
+
+        from dragon.infrastructure import parameters as dparms
+
+        return cls(state=NodeDescriptor.State.ACTIVE,
+                   name=name,
+                   host_name=host_name,
+                   ip_addrs=ip_addrs,
+                   host_id=get_host_id(),
+                   shep_cd=dparms.this_process.local_shep_cd,
+                   cpu_devices=list(os.sched_getaffinity(0)),
+                   accelerators=find_accelerators())
+
     @property
     def sdesc(self):
         return self.get_sdict()
@@ -204,8 +246,16 @@ def get_sdict(self):
             "num_cpus": self.num_cpus,
             "physical_mem": self.physical_mem,
             "shep_cd": self.shep_cd,
+            "overlay_cd": self.overlay_cd,
+            "cpu_devices": self.cpu_devices
         }

+        # Account for a NULL accelerator giving us a None for now
+        try:
+            rv["accelerators"] = self.accelerators.get_sdict()
+        except AttributeError:
+            rv["accelerators"] = None
+
         return rv

     @classmethod
diff --git a/src/dragon/infrastructure/parameters.py b/src/dragon/infrastructure/parameters.py
index f6c7857..4118f0b 100644
--- a/src/dragon/infrastructure/parameters.py
+++ b/src/dragon/infrastructure/parameters.py
@@ -209,59 +209,63 @@ class LaunchParameters:

     @classmethod
     def init_class_vars(cls):
+        typecast = lambda ty: lambda val: ty() if val == "" else ty(val)
+
         PARMS = [
-            TypedParm(name=dfacts.MODE, cast=str, check=check_mode, default=dfacts.TEST_MODE),
-            TypedParm(name=dfacts.INDEX, cast=int, check=nonnegative, default=0),
-            TypedParm(name=dfacts.DEFAULT_PD, cast=str, check=check_base64, default=""),
-            TypedParm(name=dfacts.INF_PD, cast=str, check=check_base64, default=""),
-            TypedParm(name=dfacts.LOCAL_SHEP_CD, cast=str, check=check_base64, default=""),
-            TypedParm(name=dfacts.LOCAL_BE_CD, cast=str, check=check_base64, default=""),
-            TypedParm(name=dfacts.GS_RET_CD, cast=str, check=check_base64, default=""),
-            TypedParm(name=dfacts.SHEP_RET_CD, cast=str, check=check_base64, default=""),
-            TypedParm(name=dfacts.GS_CD, cast=str, check=check_base64, default=""),
+            TypedParm(name=dfacts.MODE, cast=typecast(str), check=check_mode, default=dfacts.TEST_MODE),
+            TypedParm(name=dfacts.INDEX, cast=typecast(int), check=nonnegative, default=0),
+            TypedParm(name=dfacts.DEFAULT_PD, cast=typecast(str), check=check_base64, default=""),
+            TypedParm(name=dfacts.INF_PD, cast=typecast(str), check=check_base64, default=""),
+            TypedParm(name=dfacts.LOCAL_SHEP_CD, cast=typecast(str), check=check_base64, default=""),
+            TypedParm(name=dfacts.LOCAL_BE_CD, cast=typecast(str), check=check_base64, default=""),
+            TypedParm(name=dfacts.GS_RET_CD, cast=typecast(str), check=check_base64, default=""),
+            TypedParm(name=dfacts.SHEP_RET_CD, cast=typecast(str), check=check_base64, default=""),
+            TypedParm(name=dfacts.GS_CD, cast=typecast(str), check=check_base64, default=""),
             TypedParm(
                 name=dfacts.DEFAULT_SEG_SZ,
-                cast=int,
+                cast=typecast(int),
                 check=positive,
                 default=int(dfacts.DEFAULT_SINGLE_DEF_SEG_SZ),
), TypedParm( - name=dfacts.INF_SEG_SZ, cast=int, check=positive, default=int(dfacts.DEFAULT_SINGLE_INF_SEG_SZ) + name=dfacts.INF_SEG_SZ, cast=typecast(int), check=positive, default=int(dfacts.DEFAULT_SINGLE_INF_SEG_SZ) ), - TypedParm(name=dfacts.MY_PUID, cast=int, check=positive, default=1), - TypedParm(name=dfacts.TEST, cast=int, check=nonnegative, default=0), - TypedParm(name=dfacts.DEBUG, cast=int, check=nonnegative, default=0), - TypedParm(name=dfacts.BE_CUID, cast=int, check=positive, default=dfacts.BASE_BE_CUID), + TypedParm(name=dfacts.MY_PUID, cast=typecast(int), check=positive, default=1), + TypedParm(name=dfacts.TEST, cast=typecast(int), check=nonnegative, default=0), + TypedParm(name=dfacts.DEBUG, cast=typecast(int), check=nonnegative, default=0), + TypedParm(name=dfacts.BE_CUID, cast=typecast(int), check=positive, default=dfacts.BASE_BE_CUID), TypedParm( name=dfacts.INF_WAIT_MODE, - cast=cast_wait_mode, + cast=typecast(cast_wait_mode), check=nocheck, default=dfacts.INFRASTRUCTURE_DEFAULT_WAIT_MODE, ), TypedParm( name=dfacts.USER_WAIT_MODE, - cast=cast_wait_mode, + cast=typecast(cast_wait_mode), check=nocheck, default=dfacts.USER_DEFAULT_WAIT_MODE, ), TypedParm( name=dfacts.INF_RETURN_WHEN_MODE, - cast=cast_return_when_mode, + cast=typecast(cast_return_when_mode), check=nocheck, default=dfacts.INFRASTRUCTURE_DEFAULT_RETURN_WHEN_MODE, ), TypedParm( name=dfacts.USER_RETURN_WHEN_MODE, - cast=cast_return_when_mode, + cast=typecast(cast_return_when_mode), check=nocheck, default=dfacts.USER_DEFAULT_RETURN_WHEN_MODE, ), - TypedParm(name=dfacts.GW_CAPACITY, cast=int, check=positive, default=dfacts.GW_DEFAULT_CAPACITY), - TypedParm(name=dfacts.NUM_GW_CHANNELS_PER_NODE, cast=int, check=nonnegative, default=dfacts.DRAGON_DEFAULT_NUM_GW_CHANNELS_PER_NODE), - TypedParm(name=dfacts.HSTA_MAX_EJECTION_MB, cast=int, check=positive, default=8), - TypedParm(name=dfacts.HSTA_MAX_GETMSG_MB, cast=int, check=positive, default=8), - TypedParm(name=dfacts.PMOD_COMMUNICATION_TIMEOUT, cast=int, check=positive, default=30), - TypedParm(name=dfacts.BASEPOOL, cast=str, check=check_pool, default=dfacts.DEFAULT_POOL), + TypedParm(name=dfacts.GW_CAPACITY, cast=typecast(int), check=positive, default=dfacts.GW_DEFAULT_CAPACITY), + TypedParm(name=dfacts.NUM_GW_CHANNELS_PER_NODE, cast=typecast(int), check=nonnegative, default=dfacts.DRAGON_DEFAULT_NUM_GW_CHANNELS_PER_NODE), + TypedParm(name=dfacts.HSTA_MAX_EJECTION_MB, cast=typecast(int), check=positive, default=8), + TypedParm(name=dfacts.HSTA_MAX_GETMSG_MB, cast=typecast(int), check=positive, default=8), + TypedParm(name=dfacts.PMOD_COMMUNICATION_TIMEOUT, cast=typecast(int), check=positive, default=30), + TypedParm(name=dfacts.BASEPOOL, cast=typecast(str), check=check_pool, default=dfacts.DEFAULT_POOL), + TypedParm(name=dfacts.OVERLAY_FANOUT, cast=typecast(int), check=positive, default=32), + TypedParm(name=dfacts.NET_CONF_CACHE, cast=typecast(str), check=nocheck, default=dfacts.DEFAULT_NET_CONF_CACHE), ] env = os.environ @@ -276,7 +280,7 @@ def init_class_vars(cls): cls.gw_env_vars = frozenset([f'GW{x+1}' for x in range(num_gateways)]) for gw_env_var in cls.gw_env_vars: - PARMS.append(TypedParm(gw_env_var, cast=str, check=nocheck, default="")) + PARMS.append(TypedParm(gw_env_var, cast=typecast(str), check=nocheck, default="")) cls.PARMS = PARMS diff --git a/src/dragon/infrastructure/policy.py b/src/dragon/infrastructure/policy.py index fef616b..0a8c2d4 100644 --- a/src/dragon/infrastructure/policy.py +++ b/src/dragon/infrastructure/policy.py @@ -1,10 +1,36 @@ 
import enum
from dataclasses import asdict, dataclass, field
+from dragon.infrastructure.util import stack
+import threading
+

 @dataclass
 class Policy:
     """
-    Dataclass to describe what node, affinity, and pattern to distribute a Policy to
+    The purpose of the Policy object is to enable fine-tuning of process distribution
+    across nodes, devices, cores, and so on.
+    For example, spawning a process that should only run on certain CPU cores, or only have
+    access to specific GPU devices on a node, or how to spread a collection of processes
+    over nodes (e.g. Round-robin, Blocking, etc.).
+
+    There is a default global Policy in place that utilizes no core or accelerator affinity,
+    and places processes across nodes in a round-robin fashion. Using Policy is meant to fine-tune
+    this default policy, and any non-specific attributes will use the default policy values.
+
+    For a full example of setup for launching processes through Global Services, see `examples/dragon_gs_client/pi_demo.py`.
+    When launching a process, simply create a policy with the desired fine-tuning and pass it in as a parameter.
+
+    .. code-block:: python
+    import dragon.globalservices.process as dgprocess
+    import dragon.infrastructure.policy as dgpol
+
+    # Do setup
+    # ...
+    # ...
+    # Create a custom policy with core affinity and a specific GPU selected
+    policy = dgpol.Policy(cpu_affinity=[0,2], gpu_affinity=[0])
+    # Launch the process with the fine-tuned policy
+    p = dgprocess.create(cmd, wdir, cmd_params, None, options=options, policy=policy)
     """

     class Placement(enum.IntEnum):
@@ -14,17 +40,29 @@ class Placement(enum.IntEnum):
         Local and Anywhere will be useful later for multi-system communication
         Right now Placement will have little effect unless HOST_NAME or HOST_ID
         are used, which will try to place a policy on the specified node
+
+        LOCAL - Local to current system of nodes
+        ANYWHERE - Place anywhere
+        HOST_NAME - Place on node with specific name
+        HOST_ID - Place on node with specific ID
+        DEFAULT - Defaults to ANYWHERE
         """
-        LOCAL = -5 # Local to the current system of nodes
-        ANYWHERE = -4 # What it says on the tin
-        HOST_NAME = -3 # Tells evaluator to check host_name in policy and apply to that node
-        HOST_ID = -2 # Tells evaluator to check host_id in policy and apply to that node
-        DEFAULT = -1 # For now, defaults to ANYWHERE
+
+        LOCAL = -5  # Local to the current system of nodes
+        ANYWHERE = -4  # What it says on the tin
+        HOST_NAME = -3  # Tells evaluator to check host_name in policy and apply to that node
+        HOST_ID = -2  # Tells evaluator to check host_id in policy and apply to that node
+        DEFAULT = -1  # For now, defaults to ANYWHERE

     class Distribution(enum.IntEnum):
         """
         Pattern to use to distribute policies across nodes
+
+        ROUNDROBIN
+        BLOCK
+        DEFAULT - Defaults to roundrobin
         """
+
         ROUNDROBIN = enum.auto()
         BLOCK = enum.auto()
         DEFAULT = enum.auto()
@@ -41,12 +79,17 @@ class Device(enum.IntEnum):
     class Affinity(enum.IntEnum):
         """
         Device Affinity distribution
+
+        ANY - Place on any available core or GPU device
+        SPECIFIC - Pin to specific CPU cores or GPU devices
+        DEFAULT - Defaults to ANY
         """
+
         ANY = -5
         # ROUNDROBIN = -4 # TODO: Not implemented
         # BLOCK = -3 # TODO: Not implemented
         SPECIFIC = -2
-        DEFAULT = -1 # Same as ANY
+        DEFAULT = -1  # Same as ANY  # TODO: Not implemented

     class WaitMode(enum.IntEnum):
@@ -55,15 +98,19 @@
         SPIN = enum.auto()
         DEFAULT = enum.auto()

-    placement : Placement = Placement.DEFAULT
-    host_name : str = "" # To be populated by caller in use with 
Placement.HOST_NAME
-    host_id : int = -1 # To be populated by caller in use with Placement.HOST_ID
-    distribution : Distribution = Distribution.DEFAULT
-    device : Device = Device.DEFAULT
-    affinity : Affinity = Affinity.DEFAULT
-    specific_affinity : list[int] = field(default_factory=list) # To be populated by caller in use with Affinity.SPECIFIC
-    wait_mode : WaitMode = WaitMode.DEFAULT # TODO (For channels)
-    refcounted : bool = True # TODO (Apply refcounting for this resource)
+    placement: Placement = Placement.DEFAULT
+    host_name: str = ""  # To be populated by caller in use with Placement.HOST_NAME
+    host_id: int = -1  # To be populated by caller in use with Placement.HOST_ID
+    distribution: Distribution = Distribution.DEFAULT
+    device: Device = Device.DEFAULT  # What device to apply to
+    affinity: Affinity = Affinity.DEFAULT  # Affinity distribution
+    cpu_affinity: list[int] = field(
+        default_factory=list
+    )  # To be populated by caller in use with Affinity.SPECIFIC, specify exact device IDs or cores
+    gpu_env_str: str = ""  # To be used with gpu_affinity for vendor specific environment vars
+    gpu_affinity: list[int] = field(default_factory=list)
+    wait_mode: WaitMode = WaitMode.DEFAULT  # TODO (For channels)
+    refcounted: bool = True  # TODO (Apply refcounting for this resource)

     @property
     def sdesc(self):
@@ -71,15 +118,17 @@ def sdesc(self):

     def get_sdict(self):
         rv = {
-            "placement" : self.placement,
-            "host_name" : self.host_name,
-            "host_id" : self.host_id,
-            "distribution" : self.distribution,
-            "device" : self.device,
-            "affinity" : self.affinity,
-            "specific_affinity" : self.specific_affinity,
-            "wait_mode" : self.wait_mode,
-            "refcounted" : self.refcounted,
+            "placement": self.placement,
+            "host_name": self.host_name,
+            "host_id": self.host_id,
+            "distribution": self.distribution,
+            "device": self.device,
+            "affinity": self.affinity,
+            "cpu_affinity": self.cpu_affinity,
+            "gpu_env_str": self.gpu_env_str,
+            "gpu_affinity": self.gpu_affinity,
+            "wait_mode": self.wait_mode,
+            "refcounted": self.refcounted,
         }

         return rv
@@ -88,6 +137,27 @@ def get_sdict(self):
     def from_sdict(cls, sdict):
         return Policy(**sdict)

+    @classmethod
+    def _init_global_policy(cls):
+        # This is put in thread-local storage to ensure that it behaves in a
+        # multi-threaded environment.
+        if not hasattr(_policy_local, "policy_stack"):
+            _policy_local.policy_stack = stack([GS_DEFAULT_POLICY])
+
+    @classmethod
+    def global_policy(cls):
+        cls._init_global_policy()
+        # The following clones the policy so that the original is not
+        # mutated should the user mutate the returned policy.
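+        # A minimal sketch of the intended use (hypothetical values; any field
+        # not set falls back to the defaults on this dataclass):
+        #
+        #     with Policy(distribution=Policy.Distribution.BLOCK):
+        #         with Policy(gpu_affinity=[0]):
+        #             p = Policy.global_policy()  # clone of the innermost policy
+        #         # each __exit__ pops, restoring the enclosing policy
+        #
+        # The clone below round-trips through get_sdict()/from_sdict(), so
+        # mutating the returned policy never touches the stacked original.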
+ return cls.from_sdict(_policy_local.policy_stack.top().get_sdict()) + + def __enter__(self): + self._init_global_policy() + _policy_local.policy_stack.push(self) + + def __exit__(self, *args): + _policy_local.policy_stack.pop() + # Default policy to merge incoming policies against GS_DEFAULT_POLICY = Policy( @@ -97,7 +167,12 @@ def from_sdict(cls, sdict): distribution=Policy.Distribution.ROUNDROBIN, device=Policy.Device.CPU, affinity=Policy.Affinity.ANY, - specific_affinity=[], + cpu_affinity=[], + gpu_env_str="", + gpu_affinity=[], wait_mode=Policy.WaitMode.IDLE, - refcounted=True -) \ No newline at end of file + refcounted=True, +) + +# _policy_local is private to the current thread +_policy_local = threading.local() diff --git a/src/dragon/infrastructure/util.py b/src/dragon/infrastructure/util.py index 73d6d07..e98f4b4 100644 --- a/src/dragon/infrastructure/util.py +++ b/src/dragon/infrastructure/util.py @@ -45,20 +45,20 @@ def _get_access_modes(f): class NewlineStreamWrapper: """Helper class for sending json objects through streams. - This class sends newline-delimited newline-free strings - through streams with an interface similar to multiprocessing.Connection. + This class sends newline-delimited newline-free strings + through streams with an interface similar to multiprocessing.Connection. - It is used for sending around JSON-encoded infrastructure - messages through streams and in testing. + It is used for sending around JSON-encoded infrastructure + messages through streams and in testing. - TODO: consider whether this belongs in base code. It feels more - like something that should only be in a test bench. + TODO: consider whether this belongs in base code. It feels more + like something that should only be in a test bench. """ def __init__(self, stream, read_intent=None, write_intent=None): """ - read_intent and write_intent are used to ensure - clients are using the right operations on the stream. + read_intent and write_intent are used to ensure + clients are using the right operations on the stream. """ try: readable, writeable = _get_access_modes(stream) @@ -69,9 +69,9 @@ def __init__(self, stream, read_intent=None, write_intent=None): self.read_intent = bool(readable if read_intent is None else read_intent) self.write_intent = bool(writeable if write_intent is None else write_intent) if self.read_intent and not readable: - warn(f'Intend to read a stream not open for reading: {stream}') + warn(f"Intend to read a stream not open for reading: {stream}") if self.write_intent and not writeable: - warn(f'Intend to write a stream not open for writing: {stream}') + warn(f"Intend to write a stream not open for writing: {stream}") self.stream = stream def send(self, data): @@ -83,9 +83,9 @@ def send(self, data): :type data: string """ try: - assert self.write_intent, 'sending to a read wrap' + assert self.write_intent, "sending to a read wrap" assert isinstance(data, str) - msg_str = data.strip().replace('\n', ' ') + '\n' + msg_str = data.strip().replace("\n", " ") + "\n" # try to tolerate if the stream is a text mode stream if isinstance(self.stream, io.TextIOBase): @@ -104,7 +104,7 @@ def recv(self): """Perform the read operation on the stream. Assert that the read_intent is set, ensuring the client is using the right operation on the stream. 
""" - assert self.read_intent, 'receiving from a write wrap' + assert self.read_intent, "receiving from a write wrap" line = self.stream.readline() # try to tolerate if the stream is a text stream @@ -116,7 +116,7 @@ def recv(self): return stuff def poll(self, timeout=0): - """ Poll for the read I/O events on the registered file objects of type stream. + """Poll for the read I/O events on the registered file objects of type stream. Collect the selector if already set and its associated events, else the default selector and register with the read event, to wait upon. Also, assert that the ride_intent is set, ensuring the client is using the right operation on the stream. @@ -135,7 +135,6 @@ def poll(self, timeout=0): except AttributeError: pass - try: sel = self.selector except AttributeError: @@ -163,13 +162,13 @@ def close(self): class PriorityMultiMap: """Class combining a multimap with a priority queue to allow timeouts. - Correct behavior relies on keys never being reused. + Correct behavior relies on keys never being reused. - This is used in Global Services to manage requests for timeout - notification from multiple sources. + This is used in Global Services to manage requests for timeout + notification from multiple sources. - TODO: this needs some unit tests - TODO: consider reimplementing in terms of a collections.Counter object! + TODO: this needs some unit tests + TODO: consider reimplementing in terms of a collections.Counter object! """ def __init__(self): @@ -177,7 +176,7 @@ def __init__(self): self.dead_keys = set() self.timeout_keys = set() self.timeout_pq = list() - self.deadlines = dict() # key: (key,value), value: deadline + self.deadlines = dict() # key: (key,value), value: deadline def put(self, key, value, timeout=None): """Store (key, value) pair to the map (internal items), used by the global requests @@ -349,7 +348,7 @@ def survey_dev_shm(): """ my_id = os.getuid() - with os.scandir('/dev/shm') as it: + with os.scandir("/dev/shm") as it: # during a certain shutdown race condition, FileNotFoundError may be raised. try: rv = {entry.name for entry in it if entry.stat().st_uid == my_id} @@ -373,14 +372,14 @@ def compare_dev_shm(previous): remaining = now - previous if remaining: - print(f'warning: {len(remaining)} leftover files in /dev/shm: ') + print(f"warning: {len(remaining)} leftover files in /dev/shm: ") for fn in remaining: print(fn) def route(msg_type, routing_table, metadata=None): - """ Decorator routing adapter. + """Decorator routing adapter. This is a function decorator used to accumulate handlers for a particular kind of message into a routing table indexed by type, used @@ -407,16 +406,17 @@ def decorator_route(f): # TODO: PE-37739, the full on client debug hookup story. 
def mk_fifo_debugger(basename, *, override_bph=False, quiet=False):
-    dbg_in_fn = basename + '_dbg_in'
-    dbg_out_fn = basename + '_dbg_out'
+    dbg_in_fn = basename + "_dbg_in"
+    dbg_out_fn = basename + "_dbg_out"
     os.mkfifo(dbg_in_fn)
     os.mkfifo(dbg_out_fn)
     if not quiet:
-        print(f'fifos made: in: {dbg_in_fn} out: {dbg_out_fn}; waiting on open')
+        print(f"fifos made: in: {dbg_in_fn} out: {dbg_out_fn}; waiting on open")
         sys.stdout.flush()
-    in_fifo = open(dbg_in_fn, 'r')
-    out_fifo = open(dbg_out_fn, 'w')
+    in_fifo = open(dbg_in_fn, "r")
+    out_fifo = open(dbg_out_fn, "w")
     import pdb
+
     debugger = pdb.Pdb(stdin=in_fifo, stdout=out_fifo)

     if override_bph:
@@ -428,8 +428,8 @@ def mk_fifo_debugger(basename, *, override_bph=False, quiet=False):
 def to_str(x):
     """Convert anything to a string"""
     if x is None:
-        return ''
-    if hasattr(x, 'decode'):
+        return ""
+    if hasattr(x, "decode"):
         return x.decode()
     return str(x)
@@ -467,7 +467,9 @@ def to_str_iter(seq):
     yield from map(to_str, seq)


-def range_expr(s: str, prog: re.Pattern = re.compile(r'(?P<start>\d+)(?:-(?P<stop>\d+)(?::(?P<step>\d+))?)?$')) -> Iterable[int]:
+def range_expr(
+    s: str, prog: re.Pattern = re.compile(r"(?P<start>\d+)(?:-(?P<stop>\d+)(?::(?P<step>\d+))?)?$")
+) -> Iterable[int]:
     """Return iterable corresponding to the given range expression of the
     form START[-STOP[:STEP]][,...].

@@ -482,14 +484,16 @@ def range_expr(s: str, prog: re.Pattern = re.compile(r"(?P\d+)(?:-(?P= FIRST_PUID:
-        kwargs['flush'] = True
+        kwargs["flush"] = True
     print(*args, **kwargs)

@@ -525,8 +529,8 @@ def get_host_info(network_prefix) -> tuple[str, str, list[str]]:
     from dragon.transport.ifaddrs import getifaddrs, InterfaceAddressFilter
     from ..dlogging.util import DragonLoggingServices as dls

-    log = logging.getLogger(dls.LA_BE).getChild('get_host_info')
-    _user = os.environ.get('USER', str(os.getuid()))
+    log = logging.getLogger(dls.LA_BE).getChild("get_host_info")
+    _user = os.environ.get("USER", str(os.getuid()))

     ifaddr_filter = InterfaceAddressFilter()
     ifaddr_filter.af_inet(inet6=False)  # Disable IPv6 for now
@@ -534,10 +538,10 @@ def get_host_info(network_prefix) -> tuple[str, str, list[str]]:
     try:
         ifaddrs = list(filter(ifaddr_filter, getifaddrs()))
     except OSError:
-        log.exception('Failed to get network interface addresses')
+        log.exception("Failed to get network interface addresses")
         raise
     if not ifaddrs:
-        _msg = 'No network interface with an AF_INET address was found'
+        _msg = "No network interface with an AF_INET address was found"
         log.error(_msg)
         raise RuntimeError(_msg)
@@ -550,11 +554,51 @@ def get_host_info(network_prefix) -> tuple[str, str, list[str]]:
     ifaddr_filter.clear()
     ifaddr_filter.name_re(re.compile(re_prefix))
-    ip_addrs = [ifa['addr']['addr'] for ifa in filter(ifaddr_filter, ifaddrs)]
+    ip_addrs = [ifa["addr"]["addr"] for ifa in filter(ifaddr_filter, ifaddrs)]
     if not ip_addrs:
-        _msg = f'No IP addresses found matching regex pattern: {network_prefix}'
+        _msg = f"No IP addresses found matching regex pattern: {network_prefix}"
         log.error(_msg)
         raise RuntimeError(_msg)

     log.info(f"Found IP addresses: {','.join(ip_addrs)}")
     return _user, ip_addrs
+
+
+class stack:
+    """This provides a traditional stack implementation for use in the dragon infrastructure."""
+
+    def __init__(self, initial_items=[]):
+        # A stack can be given an initial contents. The first item in initial_items will be on the bottom of the constructed stack.
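+        # For example (hypothetical values): stack([1, 2]).top() is 2, and
+        # successive pops yield 2 then 1; the policy machinery relies on this
+        # LIFO ordering when nesting `with Policy(...)` blocks.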
+        self.items = []
+        for item in initial_items:
+            self.push(item)
+
+    def pop(self):
+        # Pop returns the last item pushed and raises a RuntimeError if the stack is currently empty.
+        if self.isEmpty():
+            raise RuntimeError("Attempt to pop an empty stack")
+
+        topIdx = len(self.items) - 1
+        item = self.items[topIdx]
+        del self.items[topIdx]
+        return item
+
+    def push(self, item):
+        # Pushes an item onto the top of the stack.
+        self.items.append(item)
+
+    def top(self):
+        # Top returns the top item of the stack or raises a RuntimeError if the stack is currently empty.
+        if self.isEmpty():
+            raise RuntimeError("Attempt to get top of empty stack")
+
+        topIdx = len(self.items) - 1
+        return self.items[topIdx]
+
+    def isEmpty(self):
+        # isEmpty returns True when the stack is empty and False otherwise.
+        return len(self.items) == 0
+
+    def clear(self):
+        # Clears the stack of all items, resetting it to an empty stack.
+        self.items = []
diff --git a/src/dragon/launcher/backend.py b/src/dragon/launcher/backend.py
index d59bf3c..293c561 100644
--- a/src/dragon/launcher/backend.py
+++ b/src/dragon/launcher/backend.py
@@ -21,7 +21,7 @@
 from ..infrastructure import messages as dmsg
 from ..infrastructure.node_desc import NodeDescriptor
 from ..infrastructure.connection import Connection, ConnectionOptions
-from ..infrastructure.parameters import POLICY_INFRASTRUCTURE
+from ..infrastructure.parameters import POLICY_INFRASTRUCTURE, this_process
 from ..infrastructure.util import NewlineStreamWrapper, route


@@ -127,6 +127,12 @@ def __init__(self, transport_test_env, network_prefix):
         self._shutdown = threading.Event()
         self._logging_shutdown = threading.Event()

+        # Only the primary launcher backend communicates with GlobalServices.
+        # Initialize gs_channel and gs_queue to None to enable us to verify that
+        # they are properly initialized before use.
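+        # (The message handlers below rely on this: handle_gs_process_create, for
+        # example, raises a RuntimeError instead of dereferencing a gs_queue that
+        # was never created on a non-primary node.)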
+ self.gs_channel = None + self.gs_queue = None + def __enter__(self): return self @@ -264,6 +270,16 @@ def _close_conns_chs_pools(self): except Exception: pass + try: + log.debug('frontend_fwd_conns close') + for conn in self.frontend_fwd_conns.values(): + try: + conn.close() + except Exception: + pass + except Exception: + pass + try: log.debug('gs_queue close') self.gs_queue.close() @@ -321,6 +337,7 @@ def _close_conns_chs_pools(self): try: log.debug('be_mpool destroy') self.be_mpool.destroy() + del self.be_mpool except Exception: pass @@ -398,14 +415,28 @@ def _close_overlay_comms(self): pass self.local_inout.close() log.debug('local_inout closed') - self.local_ch_in.destroy() - log.debug('local_ch_in destroyed') - self.local_ch_out.destroy() - log.debug('local_ch_out destroyed') - self.be_inbound.destroy() - log.debug('be_inbound destroyed') - self.be_mpool.destroy() - log.debug('be_mpool destroyed') + + for conn in self.frontend_fwd_conns.values(): + conn.close() + log.debug('closed all backend forwarding connections') + + try: + self.local_ch_in.destroy() + log.debug('local_ch_in destroyed') + except Exception: + pass + + try: + self.local_ch_out.destroy() + log.debug('local_ch_out destroyed') + except Exception: + pass + + try: + self.be_inbound.destroy() + log.debug('be_inbound destroyed') + except Exception: + pass log.info('overlay infra down') @@ -424,6 +455,56 @@ def _start_localservices(self): # Return the file descriptors to make mocking easier return ls_proc, ls_proc.stdin.fileno(), ls_proc.stdout.fileno() + def _get_my_child_nodes(self, index, nnodes, fanout): + """Get the node IDs I need to forward bcast messages to + + :param index: my node index, assume 0-indexed counting (ie: start counting at 0) + :type index: int + :param nnodes: number of nodes on backend + :type nnodes: int + :param fanout: number of nodes a given node will send to + :type fanout: int + """ + + return list(range(fanout * (index+1), min(nnodes, fanout * (index+1) + fanout))) + + def _construct_child_forwarding(self, + node_info: dict[NodeDescriptor]): + """Set up any infrastructure messages I must forward to other backend nodes + + :param node_info: node indices and channel descriptors + :type node_info: dict[NodeDescriptor] + """ + + log = logging.getLogger(dls.LA_BE).getChild('_construct_child_forwarding') + + # Figure out what nodes I need to forward to + nnodes = len(node_info) + fanout = this_process.overlay_fanout + fwd_conns = {} + + # I need to work out what nodes I'm responsible for + log.info(f'getting children for fan {fanout} and {nnodes} nodes. 
My idx = {self.node_idx}')
+        ids_to_forward = self._get_my_child_nodes(self.node_idx, nnodes, fanout)
+        if 0 == len(ids_to_forward):
+            # my index is in the last tree layer, I don't need to forward
+            log.info(f'{self.node_idx} is a leaf node')
+            return fwd_conns
+
+        log.info(f'{self.node_idx} forward to nodes {ids_to_forward}')
+
+        conn_options = ConnectionOptions(default_pool=self.be_mpool, min_block_size=2 ** 16)
+        conn_policy = POLICY_INFRASTRUCTURE
+        for idx in ids_to_forward:
+            outbound = Channel.attach(B64.from_str(node_info[str(idx)].overlay_cd).decode(),
+                                      mem_pool=self.be_mpool)
+            fwd_conns[idx] = Connection(outbound_initializer=outbound,
+                                        options=conn_options,
+                                        policy=conn_policy)
+            fwd_conns[idx].ghost = True
+
+        return fwd_conns
+
     def run_startup(self,
                     arg_ip_addr: str,
                     arg_host_id: str,
@@ -550,6 +631,28 @@ def run_startup(self,
             log.error(f'node_idx = {self.node_idx} | invalid node id, error starting LA BE')
         log.debug(f'my node index is {self.node_idx}')

+        # Update my overlay network with the extra IP and host ID info:
+        log.debug('Before sending TAUpdateNodes')
+        update_msg = dmsg.TAUpdateNodes(tag=dlutil.next_tag(),
+                                        nodes=list(fe_node_idx_msg.forward.values()))
+        log.debug(f'sending TAUpdateNodes to overlay: {update_msg.uncompressed_serialize()}')
+        self.local_inout.send(update_msg.serialize())
+
+        # If I'm a tree child of the frontend, I need to forward infrastructure messages to
+        # specific children of my own
+        self.frontend_fwd_conns = self._construct_child_forwarding(fe_node_idx_msg.forward)
+        log.debug(f'connections to forward to: {self.frontend_fwd_conns}')
+
+        # If I have any FENodeIdxBE messages to propagate, do it now,
+        # filling in that node's index as well as my channel descriptor, for when
+        # we eventually have a hierarchical reduce implemented
+        for idx, conn in self.frontend_fwd_conns.items():
+            fe_node_idx = dmsg.FENodeIdxBE(tag=dlutil.next_tag(),
+                                           node_index=int(idx),
+                                           forward=fe_node_idx_msg.forward,
+                                           send_desc=self.be_inbound)
+            conn.send(fe_node_idx.serialize())
+
        # This starts the "down the tree" router.
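+        # Worked example of the tree (hypothetical sizes): with overlay_fanout=2
+        # and nnodes=7 the frontend seeds nodes 0 and 1; _get_my_child_nodes then
+        # gives node 0 -> [2, 3], node 1 -> [4, 5], node 2 -> [6], and nodes
+        # 3..6 get no children, so every backend receives each message exactly once.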
log.debug('starting frontend monitor')
         recv_msgs_from_overlaynet_thread_args = (self.la_be_stdin,)
@@ -651,6 +754,8 @@ def receive_messages_from_overlaynet(self, la_be_stdin: dlutil.SRQueue):
                     break
                 la_be_stdin.send(fe_msg.serialize())
                 log.info(f'received {type(fe_msg)} from Launcher FE and sent to Launcher BE')
+
+                # If the frontend told us to halt, stop receiving messages
                 if isinstance(fe_msg, dmsg.BEHalted):
                     log.debug("Exiting receive_messages_from_overlaynet")
                     running = False
@@ -698,6 +803,8 @@ def send_messages_to_overlaynet(self, la_be_stdout: dlutil.SRQueue):
                     # Do any message-specific handling, then forward it along
                     self.to_overlaynet_log.info(f"received {type(msg)}")
                     if type(msg) in LauncherBackEnd._DTBL:
+                        if isinstance(msg, dmsg.SHFwdOutput):
+                            self.to_overlaynet_log.debug(f"{msg}")
                         self.infra_out.send(msg.serialize())
                         self.to_overlaynet_log.info(f"forwarded {type(msg)} to frontend")
@@ -810,9 +917,29 @@ def run_msg_server(self):
             # Close infrastructure communication
             self._close_overlay_comms()

+    def forward_to_leaves(self, msg):
+        """Forward an infrastructure message to the leaves I'm responsible for
+
+        :param msg: Infrastructure message
+        :type msg: dragon.infrastructure.messages._MsgBase
+        """
+        for conn in self.frontend_fwd_conns.values():
+            conn.send(msg.serialize())
+
     @route(dmsg.GSProcessCreate, _DTBL)
     def handle_gs_process_create(self, msg: dmsg.GSProcessCreate):
-        self.msg_log.info('received a GSProcessCreate in the Launcher Backend and forwarded to GS.')
+
+        # The GSProcessCreate message should only be received by the Primary Launcher Backend process.
+        # Verify that we're the primary and that our connection to Global Services is established.
+        if (not self.is_primary) or self.transport_test_env:
+            self.msg_log.error('Non-primary LA_BE received GSProcessCreate Request. Cannot forward request to GlobalServices.')
+            raise RuntimeError('Non-primary LA_BE received GSProcessCreate Request.')
+
+        if not self.gs_queue:
+            self.msg_log.error('Primary LA_BE received GSProcessCreate Request but gs_queue not established. Cannot forward request to GlobalServices.')
+            raise RuntimeError('Primary LA_BE received GSProcessCreate Request but gs_queue not established.')
+
+        self.msg_log.info('Primary la_be received a GSProcessCreate in the Launcher Backend and forwarded it to GS.')
         self.gs_queue.send(msg.serialize())

     @route(dmsg.GSProcessCreateResponse, _DTBL)
@@ -830,6 +957,12 @@ def handle_gs_head_exit(self, msg: dmsg.GSHeadExit):

     @route(dmsg.GSTeardown, _DTBL)
     def handle_gs_teardown(self, msg: dmsg.GSTeardown):
+        # The GSTeardown message should only be received by the Primary Launcher Backend process.
+        # Verify that we're the primary and that our connection to Global Services is established.
+        if (not self.is_primary) or self.transport_test_env:
+            self.msg_log.error('Non-primary LA_BE received GSTeardown Request. 
Cannot forward request to GlobalServices.')
+            raise RuntimeError('Non-primary LA_BE received GSTeardown Request.')
+
         # If we've been told to teardown before infrastructure is fully up,
         # we may be in trouble, so guard against that
         self.msg_log.info('m4.1 primary la_be received GSTeardown')
@@ -844,6 +977,10 @@ def handle_gs_teardown(self, msg: dmsg.GSTeardown):
     def handle_sh_halt_ta(self, msg: dmsg.SHHaltTA):
         self.msg_log.info('m7.1 la_be received SHHaltTA')
         try:
+            # Forward to leaves
+            self.forward_to_leaves(msg)
+
+            # Try forwarding to local services
             if self._state >= BackendState.LS_UP:
                 self.ls_queue.send(msg.serialize())
                 self.msg_log.info('m7.2 la_be forwarded SHHaltTA to LS')
@@ -857,9 +994,14 @@ def handle_sh_halt_ta(self, msg: dmsg.SHHaltTA):

     @route(dmsg.SHTeardown, _DTBL)
     def handle_sh_teardown(self, msg: dmsg.SHTeardown):
+
         self.msg_log.info('m11.1 la_be received SHTeardown')
-        self.msg_log.info('m11.2 la_be forwarded SHTeardown to LS')
+
+        self.forward_to_leaves(msg)
+        self.msg_log.info('forwarded SHTeardown to backend leaves')
+
         self.ls_queue.send(msg.serialize())
+        self.msg_log.info('m11.2 la_be forwarded SHTeardown to LS')

     @route(dmsg.BEHalted, _DTBL)
     def handle_be_halted(self, msg: dmsg.BEHalted):
@@ -867,6 +1009,9 @@ def handle_be_halted(self, msg: dmsg.BEHalted):
         self._shutdown.set()
         self.la_be_stdout.send(msg.serialize())

+        self.forward_to_leaves(msg)
+        self.msg_log.info('forwarded BEHalted to backend leaves')
+
         # This is the dmsg.BEHalted
         self.ls_stdin.send(msg.serialize())
         self.msg_log.debug("la_be forwarded m14 BEHalted")
@@ -893,7 +1038,10 @@ def handle_la_channels_info(self, msg: dmsg.LAChannelsInfo):
         if self.is_primary and (not self.transport_test_env):
             self.gs_channel = Channel.attach(B64.from_str(msg.gs_cd).decode())
             self.gs_queue = Connection(outbound_initializer=self.gs_channel, policy=POLICY_INFRASTRUCTURE)
-            self.msg_log.debug(f'Created gs_queue: {self.gs_queue}')
+            self.msg_log.debug(f'Primary la_be created gs_queue: {self.gs_queue}')
+
+        # Forward the message to my leaves
+        self.forward_to_leaves(msg)

         # Forward the LAChannelsInfo Message to Local Services
         self.ls_queue.send(msg.serialize())
diff --git a/src/dragon/launcher/dragon_single.py b/src/dragon/launcher/dragon_single.py
index 9359e64..fb47fe1 100755
--- a/src/dragon/launcher/dragon_single.py
+++ b/src/dragon/launcher/dragon_single.py
@@ -210,4 +210,4 @@ def main():

 if __name__ == '__main__':
     ecode = main()
-    exit(ecode)
+    sys.exit(ecode)
diff --git a/src/dragon/launcher/frontend.py b/src/dragon/launcher/frontend.py
index 0764cd7..8dd1823 100644
--- a/src/dragon/launcher/frontend.py
+++ b/src/dragon/launcher/frontend.py
@@ -7,7 +7,6 @@
 import subprocess
 from enum import Enum
 from shlex import quote
-
 from functools import total_ordering

 from ..utils import B64
@@ -140,6 +139,7 @@ def __init__(self, args_map, sigint_trigger=None):

         self.args_map = args_map
         self.nnodes = args_map.get('node_count', 0)
+        self.ntree_nodes = this_process.overlay_fanout
         self.network_prefix = args_map.get('network_prefix', dfacts.DEFAULT_TRANSPORT_NETIF)
         self.port = args_map.get('port', dfacts.DEFAULT_TRANSPORT_PORT)
         self._config_from_file = args_map.get('network_config', None)
@@ -621,7 +621,6 @@ def _launch_backend(self,

         the_env = dict(os.environ)
         the_env['DRAGON_NETWORK_CONFIG'] = self.net.compress()
-
         # TODO: The differentiation between the SSH path vs. other paths
         # is clunky.
Ideally, this could be abstracted to make the # if/else disappear @@ -673,6 +672,61 @@ def _launch_backend(self, return wlm_proc + def construct_bcast_tree(self, net_conf, conn_policy, be_ups, frontend_sdesc): + + log = logging.getLogger(dls.LA_FE).getChild('construct_bcast_tree') + + # Pack up all of our node descriptors for the backend: + forwarding = {} + for be_up in be_ups: + assert isinstance(be_up, dmsg.BEIsUp), 'la_fe received invalid be up' + # TODO: VERIFY WE GET A UNIQUE CUID: keep a set of seen cuids. + # After attaching, get the cuid and compare it against the set + # of already seen cuids. Throw an exception if already seen. + # Delete the set after this loop. + log.debug(f'received descriptor: {be_up.be_ch_desc} and host_id: {be_up.host_id}') + for key, node_desc in net_conf.items(): + if str(be_up.host_id) == str(node_desc.host_id): + forwarding[key] = NodeDescriptor(host_id=int(node_desc.host_id), + ip_addrs=node_desc.ip_addrs, + overlay_cd=be_up.be_ch_desc) + break + + # Send out the FENodeIdx to the child nodes I own + conn_outs = {} # key is the node_index and value is the Connection object + fe_node_idx = dmsg.FENodeIdxBE(tag=dlutil.next_tag(), + node_index=0, + forward=forwarding, + send_desc=frontend_sdesc) + log.debug(f'fanout = {this_process.overlay_fanout}') + for idx in range(this_process.overlay_fanout): + if idx < self.nnodes: + try: + be_sdesc = B64.from_str(forwarding[str(idx)].overlay_cd) + be_ch = Channel.attach(be_sdesc.decode(), mem_pool=self.fe_mpool) + conn_options = ConnectionOptions(default_pool=self.fe_mpool, min_block_size=2 ** 16) + conn_out = Connection(outbound_initializer=be_ch, + options=conn_options, + policy=conn_policy) + conn_out.ghost = True + + # Update the node index to the one we're talking to + fe_node_idx.node_index = idx + log.debug(f'sending {fe_node_idx.uncompressed_serialize()}') + conn_out.send(fe_node_idx.serialize()) + + conn_outs[idx] = conn_out + + except ChannelError as ex: + log.fatal(f'could not connect to BE channel with host_id {be_up.host_id}') + raise RuntimeError('Connection with BE failed') from ex + else: + break + + log.info('sent all FENodeIdxBE msgs') + + return conn_outs + def run_startup(self): """Complete bring up of runtime services """ @@ -899,42 +953,22 @@ def run_startup(self): be_ups = [dlutil.get_with_blocking(self.la_fe_stdin) for _ in range(nnodes)] assert len(be_ups) == self.nnodes - self.conn_outs = {} # key is the node_index and value is the Connection object - for be_up in be_ups: - assert isinstance(be_up, dmsg.BEIsUp), 'la_fe received invalid be up' - log.debug(f'received descriptor: {be_up.be_ch_desc} and host_id: {be_up.host_id}') - try: - # TODO: VERIFY WE GET A UNIQUE CUID: keep a set of seen cuids. - # After attaching, get the cuid and compare it against the set - # of already seen cuids. Throw an exception if already seen. - # Delete the set after this loop. 
- be_sdesc = B64.from_str(be_up.be_ch_desc) - be_ch = Channel.attach(be_sdesc.decode(), mem_pool=self.fe_mpool) - conn_options = ConnectionOptions(default_pool=self.fe_mpool, min_block_size=2 ** 16) - conn_out = Connection(outbound_initializer=be_ch, - options=conn_options, - policy=conn_policy) - conn_out.ghost = True - except ChannelError as ex: - log.fatal(f'could not connect to BE channel with host_id {be_up.host_id}') - raise RuntimeError('Connection with BE failed') from ex - - # Send node_index to backend - FENodeIdxBE msg - for key, node_desc in net_conf.items(): - if str(be_up.host_id) == str(node_desc.host_id): - fe_node_idx = dmsg.FENodeIdxBE(tag=dlutil.next_tag(), node_index=key) - conn_out.send(fe_node_idx.serialize()) - self.conn_outs[int(key)] = conn_out - log.debug(f"this is node_index: {key} with host_id: {be_up.host_id}") - break + # Construct the number of backend connections based on + # the hierarchical bcast info and send FENodeIdxBE to those + # nodes + log.info(f'received {nnodes} BEIsUp msgs') + self.conn_outs = self.construct_bcast_tree(net_conf, + conn_policy, + be_ups, + encoded_inbound_str) del be_ups - log.info(f'received {nnodes} BEIsUp msgs and sent {nnodes} FENodeIdxBE msgs') - chs_up = [dlutil.get_with_blocking(self.la_fe_stdin) for _ in range(nnodes)] + chs_up = [dlutil.get_with_blocking(self.la_fe_stdin) for _ in range(self.nnodes)] for ch_up in chs_up: assert isinstance(ch_up, dmsg.SHChannelsUp), 'la_fe received invalid channel up' log.info(f'received {nnodes} SHChannelsUP msgs') + nodes_desc = {ch_up.idx: ch_up.node_desc for ch_up in chs_up} gs_cds = [ch_up.gs_cd for ch_up in chs_up if ch_up.gs_cd is not None] if len(gs_cds) == 0: print('The Global Services CD was not returned by any of the SHChannelsUp messages. Launcher Exiting.') @@ -952,14 +986,25 @@ def run_startup(self): # will have one NIC per node and one gateway per node used # (for now anyway - eventually the number of gateways may not # be tied to multiple NIC support). 
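+        # As shown below, multi-NIC support is enabled for HSTA on PALS systems:
+        # num_gw_channels becomes min_nics_per_node there (one gateway per NIC)
+        # when the ofi_rma fabric backend is selected, stays 1 otherwise, and for
+        # HSTA the count is then multiplied by NUM_GW_TYPES since it uses one
+        # gateway of each type per agent.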
- #FIXME-MULTI-NIC: force one nic per node for now - if (min_nics_per_node > 1) and self.pals_lib_present and (self.transport is dfacts.TransportAgentOptions.HSTA) and False: + if (min_nics_per_node > 1) and self.pals_lib_present and (self.transport is dfacts.TransportAgentOptions.HSTA): num_gw_channels = min_nics_per_node else: num_gw_channels = 1 + if 'DRAGON_HSTA_FABRIC_BACKEND' in os.environ: + if os.environ['DRAGON_HSTA_FABRIC_BACKEND'] != 'ofi_rma': + num_gw_channels = 1 + else: + # TODO: default fabric backend + # assume default fabric backend is ofi_p2p for now + num_gw_channels = 1 + + # HSTA uses NUM_GW_TYPES gateways per agent + if self.transport is dfacts.TransportAgentOptions.HSTA: + num_gw_channels *= dfacts.NUM_GW_TYPES + # Send LAChannelsInfo in a test environment or to all - la_ch_info = dmsg.LAChannelsInfo(tag=dlutil.next_tag(), nodes_desc=chs_up, + la_ch_info = dmsg.LAChannelsInfo(tag=dlutil.next_tag(), nodes_desc=nodes_desc, gs_cd=gs_cd, num_gw_channels=num_gw_channels, port=self.port, transport=str(self.transport)) @@ -1097,6 +1142,7 @@ def probe_teardown(self): def handle_sh_fwd_output(self, msg: dmsg.SHFwdOutput): msg_out = self.build_stdmsg(msg, self.args_map, msg.fd_num == dmsg.SHFwdOutput.FDNum.STDOUT.value) + self.msg_log.debug(f'{msg}') print(msg_out, end="") @route(dmsg.LAExit, _DTBL) @@ -1120,9 +1166,13 @@ def handle_gs_head_exit(self, msg: dmsg.GSHeadExit): @route(dmsg.GSProcessCreateResponse, _DTBL) def handle_gs_proc_create_response(self, msg: dmsg.GSProcessCreateResponse): - self.msg_log.info('The GSProcessCreateResponse was received in the launcher front end') - self._gs_process_create_resp_received = True - self.probe_teardown() + if dmsg.GSProcessCreateResponse.Errors.SUCCESS == msg.err: + self._gs_process_create_resp_received = True + self.msg_log.info('The GSProcessCreateResponse was received in the launcher front end') + self.probe_teardown() + elif dmsg.GSProcessCreateResponse.Errors.FAIL == msg.err: + self.msg_log.warning(f'Unable to start the head process {msg.err_info}') + self._cleanup_abnormal_state(sigint=False) @route(dmsg.SHProcessCreateResponse, _DTBL) def handle_sh_proc_create_response(self, msg: dmsg.SHProcessCreateResponse): diff --git a/src/dragon/launcher/launch_selector.py b/src/dragon/launcher/launch_selector.py index 1064b98..09944fa 100644 --- a/src/dragon/launcher/launch_selector.py +++ b/src/dragon/launcher/launch_selector.py @@ -1,5 +1,6 @@ import os import re +import sys import shutil @@ -156,8 +157,8 @@ def main(): from dragon import _patch_multiprocessing _patch_multiprocessing() dl = get_launcher() - dl.main() + return dl.main() if __name__ == '__main__': - main() + sys.exit(main()) diff --git a/src/dragon/launcher/network_config.py b/src/dragon/launcher/network_config.py index 01c086b..33390a0 100644 --- a/src/dragon/launcher/network_config.py +++ b/src/dragon/launcher/network_config.py @@ -4,6 +4,8 @@ import json import enum import zlib +import signal +import sys from typing import Optional from base64 import b64encode, b64decode @@ -307,7 +309,7 @@ def deliver_backend_node_descriptor(network_prefix=DEFAULT_TRANSPORT_NETIF, raise RuntimeError("Unable to acquire backend network configuration") # Dump to stdout - print(json.dumps(node_info.get_sdict())) + print(json.dumps(node_info.get_sdict()), flush=True) def get_args(inputs=None): diff --git a/src/dragon/launcher/wlm/base.py b/src/dragon/launcher/wlm/base.py index 259d979..cf74d40 100644 --- a/src/dragon/launcher/wlm/base.py +++ b/src/dragon/launcher/wlm/base.py @@ -1,3 
+1,4 @@
+import os
 from abc import ABC, abstractmethod
 import json
 import logging
@@ -6,8 +7,9 @@
 from enum import Enum
 from shlex import quote

-from dragon.infrastructure.node_desc import NodeDescriptor
-from dragon.infrastructure.util import NewlineStreamWrapper
+from ...infrastructure.parameters import this_process
+from ...infrastructure.node_desc import NodeDescriptor
+from ...infrastructure.util import NewlineStreamWrapper


 class NetworkConfigState(Enum):
@@ -20,7 +22,9 @@ class NetworkConfigState(Enum):


 class BaseNetworkConfig(ABC):
-    def __init__(self, network_prefix, port, nnodes):
+
+    def __init__(self, wlm, network_prefix, port, nnodes):
+        self.wlm = wlm
         self.NNODES = nnodes
         self.NETWORK_CFG_HELPER_LAUNCH_CMD = [
             "dragon-network-config-launch-helper",
@@ -34,8 +38,12 @@ def __init__(self, network_prefix, port, nnodes):
         ]

+        self.NET_CONF_CACHE = this_process.net_conf_cache
+
         self.LOGGER = logging.getLogger("NetworkConfig")

+        self.node_descriptors = {}
+
         self._sigint_trigger = None
         self._sigint_triggered = False
         self.config_helper = None
@@ -48,35 +56,50 @@ def check_for_wlm_support(cls) -> bool:
         raise NotImplementedError

     @abstractmethod
-    def _launch_network_config_helper(self) -> subprocess.Popen:
+    def _get_wlm_job_id(self) -> str:
         raise NotImplementedError

-    def _parse_network_configuration(self, stdout) -> None:
-        self.node_descriptors = {}
+    @abstractmethod
+    def _launch_network_config_helper(self) -> subprocess.Popen:
+        raise NotImplementedError

-        stdout_stream = NewlineStreamWrapper(stdout)
+    def _parse_network_configuration(self) -> None:
+        last_node_descriptor_count = 0
+        stdout_stream = NewlineStreamWrapper(self.config_helper.stdout)
+        stderr_stream = NewlineStreamWrapper(self.config_helper.stderr)
         while len(self.node_descriptors.keys()) != self.NNODES:
-            if stdout_stream.poll():
-                lines = []
-                while stdout_stream.poll():
-                    line = stdout_stream.recv()
-                    # sattach returns an empty string if nothing to report. ignore
-                    if line == "":
-                        break
-                    else:
-                        lines.append(line)
-
-                for line in lines:
-                    self.LOGGER.debug(f'{line=}')
-                    if line != "":
-                        node_index, node_desc = line.split(": ", maxsplit=1)
-                        if " " in node_index:
-                            node_index = node_index.split(" ")[-1]
-                        if str(node_index) not in self.node_descriptors.keys():
-                            self.LOGGER.debug(json.loads(node_desc))
-                            self.node_descriptors[
-                                str(node_index)
-                            ] = NodeDescriptor.from_sdict(json.loads(node_desc))
+
+            lines = []
+            node_descriptor_count = len(self.node_descriptors.keys())
+            if last_node_descriptor_count != node_descriptor_count:
+                self.LOGGER.debug(f'received {node_descriptor_count} of {self.NNODES} expected NodeDescriptors')
+                last_node_descriptor_count = node_descriptor_count
+
+            if self.config_helper.poll():  # truthy only once the helper has exited with a non-zero code
+                if self.config_helper.returncode != 0:  # Did the helper process exit with a non-zero error code?
+                    out, err = self.config_helper.communicate()
+                    raise RuntimeError(str(err))
+
+            while stdout_stream.poll():
+                line = stdout_stream.recv()
+                # sattach returns an empty string if nothing to report. 
ignore + if line == "": + break + else: + lines.append(line) + + for line in lines: + self.LOGGER.debug(f'{line=}') + node_index, node_desc = line.split(": ", maxsplit=1) + if " " in node_index: + node_index = node_index.split(" ")[-1] + if str(node_index) not in self.node_descriptors.keys(): + self.LOGGER.debug(json.loads(node_desc)) + self.node_descriptors[ + str(node_index) + ] = NodeDescriptor.from_sdict(json.loads(node_desc)) + + self.LOGGER.debug(f'received {self.NNODES} of {self.NNODES} expected NodeDescriptors') def _sigint_teardown(self): """Safely teardown network config infrastructure""" @@ -99,6 +122,45 @@ def _sigint_handler(self, *args): self._state == NetworkConfigState.CONFIG_DONE: self._sigint_teardown() + @abstractmethod + def _supports_net_conf_cache(self) -> bool: + raise NotImplementedError + + def load_net_conf_cache(self): + """ + """ + if os.path.isfile(self.NET_CONF_CACHE): + try: + with open(self.NET_CONF_CACHE, 'r') as inf: + data = json.load(inf) + + if data['wlm'] == self.wlm and \ + data['job_id'] == self._get_wlm_job_id(): + self.LOGGER.debug("Loading cached network data") + for node_index, node_desc in data['nodes'].items(): + self.node_descriptors[ + str(node_index) + ] = NodeDescriptor.from_sdict(node_desc) + return + + except (ValueError, json.JSONDecodeError): + pass + + # Remove old cached data that doesn't match our current job + os.remove(self.NET_CONF_CACHE) + + def save_net_conf_cache(self) -> None: + with open(self.NET_CONF_CACHE, 'w', encoding='utf-8') as outf: + data = { + 'wlm': self.wlm, + 'job_id': self._get_wlm_job_id(), + 'nodes': { + node_id: node_desc.get_sdict() + for node_id, node_desc in self.node_descriptors.items() + } + } + json.dump(data, outf, ensure_ascii=False, indent=4) + def get_network_config(self, sigint_trigger=None) -> map: try: @@ -106,30 +168,40 @@ def get_network_config(self, sigint_trigger=None) -> map: self._state = NetworkConfigState.IN_PROGRESS - self.LOGGER.debug("Launching config helper.") - self.config_helper = self._launch_network_config_helper() + if self._supports_net_conf_cache(): + self.load_net_conf_cache() - if sigint_trigger == -2: - signal.raise_signal(signal.SIGINT) + # if we weren't able to load a cached network conf, + # launch the config helper + if not self.node_descriptors: + self.LOGGER.debug("Launching config helper.") + self.config_helper = self._launch_network_config_helper() + + if sigint_trigger == -2: + signal.raise_signal(signal.SIGINT) + + self.LOGGER.debug("Parsing configuration data.") + self._parse_network_configuration() + + self.LOGGER.debug("Waiting for config helper to exit.") + self.config_helper.wait() - self.LOGGER.debug("Waiting for config helper to exit.") - self.config_helper.wait() - if self.config_helper.returncode != 0: - out, err = self.config_helper.communicate() - raise RuntimeError(str(err)) + if self.config_helper.returncode != 0 and self.config_helper.returncode != -9: + out, err = self.config_helper.communicate() + raise RuntimeError(str(err)) - self.LOGGER.debug("Parsing configuration data.") - self._parse_network_configuration(self.config_helper.stdout) + self.LOGGER.debug("Closing config helper's stdout handle.") + self.config_helper.stdout.close() + self.config_helper.stderr.close() - self.LOGGER.debug("Closing config helper's stdout handle.") - self.config_helper.stdout.close() - self.config_helper.stderr.close() + if self._supports_net_conf_cache(): + self.save_net_conf_cache() if sigint_trigger == -1: signal.raise_signal(signal.SIGINT) self._state = 
NetworkConfigState.CONFIG_DONE - self.LOGGER.debug("All child procs exited. Returning output") + self.LOGGER.debug("Completed gathering Network Configuration") if self._sigint_triggered: self._sigint_teardown() diff --git a/src/dragon/launcher/wlm/pbs_pals.py b/src/dragon/launcher/wlm/pbs_pals.py index 544be14..ff2dd36 100644 --- a/src/dragon/launcher/wlm/pbs_pals.py +++ b/src/dragon/launcher/wlm/pbs_pals.py @@ -32,11 +32,13 @@ def __init__(self, network_prefix, port, hostlist): raise RuntimeError(msg) super().__init__( + 'pbs+pals', network_prefix, port, get_nodefile_node_count(os.environ.get("PBS_NODEFILE")), ) + self.job_id = os.environ.get("PBS_JOBID") self.MPIEXEC_ARGS = self.MPIEXEC_COMMAND_LINE.format(nnodes=self.NNODES).split() @classmethod @@ -45,6 +47,12 @@ def check_for_wlm_support(cls) -> bool: return re.match('.*/pals/.*', mpiexec) return False + def _get_wlm_job_id(self) -> str: + return self.job_id + + def _supports_net_conf_cache(self) -> bool: + return False + def _launch_network_config_helper(self) -> subprocess.Popen: mpiexec_launch_args = self.MPIEXEC_ARGS[:] mpiexec_launch_args.append("--line-buffer") diff --git a/src/dragon/launcher/wlm/slurm.py b/src/dragon/launcher/wlm/slurm.py index df488c8..1ed392c 100644 --- a/src/dragon/launcher/wlm/slurm.py +++ b/src/dragon/launcher/wlm/slurm.py @@ -13,17 +13,18 @@ def get_slurm_launch_be_args(args_map=None): class SlurmNetworkConfig(BaseNetworkConfig): - SRUN_COMMAND_LINE = "srun --nodes={nnodes} --ntasks={nnodes} --cpu_bind=none -u -l" + SRUN_COMMAND_LINE = "srun --nodes={nnodes} --ntasks={nnodes} --cpu_bind=none -u -l -W 0" def __init__(self, network_prefix, port, hostlist): - if not os.environ.get("SLURM_JOB_ID"): + self.job_id = os.environ.get("SLURM_JOB_ID") + if not self.job_id: msg = """Requesting a slurm network config outside of slurm job allocation. 
Resubmit as part of a 'salloc' or 'sbatch' execution.""" raise RuntimeError(msg) super().__init__( - network_prefix, port, int(os.environ.get("SLURM_JOB_NUM_NODES")) + 'slurm', network_prefix, port, int(os.environ.get("SLURM_JOB_NUM_NODES")) ) self.SRUN_ARGS = self.SRUN_COMMAND_LINE.format(nnodes=self.NNODES).split() @@ -32,6 +33,12 @@ def __init__(self, network_prefix, port, hostlist): def check_for_wlm_support(cls) -> bool: return shutil.which("srun") + def _get_wlm_job_id(self) -> str: + return self.job_id + + def _supports_net_conf_cache(self) -> bool: + return False + def _launch_network_config_helper(self) -> subprocess.Popen: srun_launch_args = self.SRUN_ARGS[:] srun_launch_args.extend(self.NETWORK_CFG_HELPER_LAUNCH_CMD) diff --git a/src/dragon/launcher/wlm/ssh.py b/src/dragon/launcher/wlm/ssh.py index 5692041..052c1f8 100644 --- a/src/dragon/launcher/wlm/ssh.py +++ b/src/dragon/launcher/wlm/ssh.py @@ -53,7 +53,7 @@ def get_ssh_launch_be_args(hostname=None, args_map=None) -> str: if ENV_VARS is None or args_map is not None: get_ssh_env_vars(args_map=args_map) - be_args = " ".join(["ssh -oBatchMode=yes {hostname}", f"cd {getcwd()} &&"] + ENV_VARS) + be_args = " ".join(["ssh -oBatchMode=yes {hostname}", f"/bin/bash -c cd {getcwd()} &&"] + ENV_VARS) return be_args @@ -241,7 +241,7 @@ class SSHNetworkConfig(BaseNetworkConfig): def __init__(self, network_prefix, port, hostlist): super().__init__( - network_prefix, port, len(hostlist) + 'ssh', network_prefix, port, len(hostlist) ) self.hostlist = hostlist @@ -249,6 +249,9 @@ def __init__(self, network_prefix, port, hostlist): def check_for_wlm_support(cls) -> bool: return shutil.which("ssh") + def _supports_net_conf_cache(self) -> bool: + return False + def _launch_network_config_helper(self) -> subprocess.Popen: popen_dict = {} diff --git a/src/dragon/localservices/local_svc.py b/src/dragon/localservices/local_svc.py index e0e05e6..2b19e37 100644 --- a/src/dragon/localservices/local_svc.py +++ b/src/dragon/localservices/local_svc.py @@ -13,7 +13,6 @@ from ..infrastructure.node_desc import NodeDescriptor from ..transport import start_transport_agent -from ..launcher.launchargs import TransportAgentOptions from ..infrastructure import connection as dconn from ..infrastructure import facts as dfacts @@ -66,7 +65,7 @@ def maybe_start_gs(gs_args: Optional[list], gs_env: Optional[dict], hostname: st ProcessProps(p_uid=dfacts.GS_PUID, critical=True, r_c_uid=None, stdin_req=None, stdout_req=None, stderr_req=None, stdin_connector=None, stdout_connector=stdout_connector, - stderr_connector=stderr_connector), + stderr_connector=stderr_connector, layout=None), gs_args, bufsize=0, stdin=subprocess.PIPE, @@ -218,10 +217,13 @@ def single(make_infrastructure_resources: bool = True, assert isinstance(msg, dmsg.BEPingSH), 'startup expectation on shep input' log.info('got BEPingSH') - ch_up_msg = dmsg.SHChannelsUp(tag=get_new_tag(), host_name='localhost', - host_id=dutils.host_id(), ip_addrs=['127.0.0.1'], - shep_cd=dparms.this_process.local_shep_cd, + ls_node_desc = NodeDescriptor.get_localservices_node_conf(host_name='localhost', + name='localhost', + ip_addrs=['127.0.0.1']) + ch_up_msg = dmsg.SHChannelsUp(tag=get_new_tag(), + node_desc=ls_node_desc, gs_cd=dparms.this_process.gs_cd) + la_input.send(ch_up_msg.serialize()) log.info('sent SHChannelsUp') gs_proc = maybe_start_gs(gs_args, gs_env, hostname='localhost', be_in=la_input) @@ -324,9 +326,14 @@ def multinode(make_infrastructure_resources: bool = True, assert isinstance(msg, dmsg.BEPingSH), 'startup 
expectation on shep input' log.info('got BEPingSH') - ch_up_msg = dmsg.SHChannelsUp(tag=get_new_tag(), host_name=hostname, host_id=dutils.host_id(), - ip_addrs=ip_addrs, shep_cd=dparms.this_process.local_shep_cd, - gs_cd=gs_cd, idx=node_index) + # Create a node descriptor for this node I'm running on + ls_node_desc = NodeDescriptor.get_localservices_node_conf(host_name=hostname, + name=hostname, + ip_addrs=ip_addrs) + ch_up_msg = dmsg.SHChannelsUp(tag=get_new_tag(), + node_desc=ls_node_desc, + gs_cd=gs_cd, + idx=node_index) la_input.send(ch_up_msg.serialize()) log.info('sent SHChannelsUp') @@ -400,7 +407,8 @@ def multinode(make_infrastructure_resources: bool = True, stderr_connector=None, stdin_req=None, stdout_req=None, - stderr_req=None + stderr_req=None, + layout=None ) except Exception as e: logging.getLogger(dls.LS).getChild('start_ta').fatal(f'transport agent launch failed on {node_index}') diff --git a/src/dragon/localservices/server.py b/src/dragon/localservices/server.py index 20cce88..ebc7229 100644 --- a/src/dragon/localservices/server.py +++ b/src/dragon/localservices/server.py @@ -40,7 +40,8 @@ def get_new_tag(): ProcessProps = collections.namedtuple('ProcessProps', ['p_uid', 'critical', 'r_c_uid', 'stdin_req', 'stdout_req', 'stderr_req', - 'stdin_connector', 'stdout_connector', 'stderr_connector']) + 'stdin_connector', 'stdout_connector', 'stderr_connector', + 'layout']) class PopenProps(subprocess.Popen): @@ -48,9 +49,17 @@ def __init__(self, props: ProcessProps, *args, **kwds): assert isinstance(props, ProcessProps) super().__init__(*args, **kwds) self.props = props - # TODO Add affinity control to the process options. To be on the safe - # TODO side for now, open affinity to all cores. + + # Assuming this is basically a free call, default the affinity to "everything" just in case os.sched_setaffinity(self.pid, range(os.cpu_count())) + if props.layout is not None: + if props.layout.cpu_core: + os.sched_setaffinity(self.pid, props.layout.cpu_core) + + # gpu_core list must be turned into a string in the form "0,1,2" etc + if props.layout.gpu_core and props.layout.accelerator_env: + os.environ[props.layout.accelerator_env] = ",".join(str(core) for core in props.layout.gpu_core) + # XXX Affinity settings are only inherited by grandchild processes # XXX created after this point in time.
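Taken on its own, the layout handling added above reduces to the following sketch; here `Layout` is a stand-in namedtuple for illustration, not the real layout descriptor class from this patch:

    import os
    from collections import namedtuple

    Layout = namedtuple("Layout", ["cpu_core", "gpu_core", "accelerator_env"])
    layout = Layout(cpu_core=[0, 1], gpu_core=[0, 1], accelerator_env="CUDA_VISIBLE_DEVICES")

    pid = os.getpid()  # stands in for the child pid held by PopenProps
    os.sched_setaffinity(pid, layout.cpu_core)  # pin CPUs when a layout is given
    if layout.gpu_core and layout.accelerator_env:
        # expose GPUs as a comma-separated list, e.g. "0,1"
        os.environ[layout.accelerator_env] = ",".join(str(c) for c in layout.gpu_core)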
Any grandchild processes # XXX started when the child process starts up most certainly will @@ -848,32 +857,36 @@ def create_process(self, msg: dmsg.SHProcessCreate) -> None: the_env['DRAGON_PMOD_CHILD_CHANNEL'] = str(dutils.B64(pmod_launch_ch.serialize())) log.info(f'p_uid {msg.t_p_uid} Setting required PMI environment variables') + # For PBS, we need to tell PMI to not use a FD to get PALS info: + try: + del the_env['PMI_CONTROL_FD'] + except KeyError: + pass + the_env['PMI_CONTROL_PORT'] = str(msg.pmi_info.control_port) the_env['MPICH_OFI_CXI_PID_BASE'] = str(msg.pmi_info.pid_base) the_env['DL_PLUGIN_RESILIENCY'] = "1" the_env['LD_PRELOAD'] = 'libdragon.so' the_env['_DRAGON_PALS_ENABLED'] = '1' the_env['FI_CXI_RX_MATCH_MODE'] = 'hybrid' - # the_env['DRAGON_DEBUG'] = '1' - # the_env['PMI_DEBUG'] = '1' stdin_connector = InputConnector(stdin_conn) - stdout_connector = OutputConnector(be_in = self.be_in, puid=msg.t_p_uid, - hostname=self.hostname, out_err=dmsg.SHFwdOutput.FDNum.STDOUT.value, - conn=stdout_conn, root_proc=stdout_root, critical_proc=False) + stdout_connector = OutputConnector(be_in=self.be_in, puid=msg.t_p_uid, + hostname=self.hostname, out_err=dmsg.SHFwdOutput.FDNum.STDOUT.value, + conn=stdout_conn, root_proc=stdout_root, critical_proc=False) - stderr_connector = OutputConnector(be_in = self.be_in, puid=msg.t_p_uid, - hostname=self.hostname, out_err=dmsg.SHFwdOutput.FDNum.STDERR.value, - conn=stderr_conn, root_proc=stderr_root, critical_proc=False) + stderr_connector = OutputConnector(be_in=self.be_in, puid=msg.t_p_uid, + hostname=self.hostname, out_err=dmsg.SHFwdOutput.FDNum.STDERR.value, + conn=stderr_conn, root_proc=stderr_root, critical_proc=False) with self.apt_lock: # race with death watcher; hold lock to get process in table. # The stdout_conn and stderr_conn will be filled in just below. the_proc = PopenProps( ProcessProps(p_uid=msg.t_p_uid, critical=False, r_c_uid=msg.r_c_uid, - stdin_req=msg.stdin, stdout_req=msg.stdout, stderr_req=msg.stderr, - stdin_connector=stdin_connector, stdout_connector=stdout_connector, - stderr_connector=stderr_connector), + stdin_req=msg.stdin, stdout_req=msg.stdout, stderr_req=msg.stderr, + stdin_connector=stdin_connector, stdout_connector=stdout_connector, + stderr_connector=stderr_connector, layout=msg.layout), real_args, bufsize=0, stdin=subprocess.PIPE, diff --git a/src/dragon/mpbridge/pool.py b/src/dragon/mpbridge/pool.py index 1dcb97f..1c5a3ed 100644 --- a/src/dragon/mpbridge/pool.py +++ b/src/dragon/mpbridge/pool.py @@ -1,10 +1,10 @@ """Dragon's replacement for Multiprocessing Pool. By default this uses a patched version of the dragon native pool and sets -`DRAGON_BASEPOOL="NATIVE"`. The private api for this class is still under -development. To revert to the version based on the `multiprocessing.Pool` class -with a patched terminate_pool method, set `DRAGON_BASEPOOL="PATCHED"` in the -environment. +`DRAGON_BASEPOOL="NATIVE"`. The private api for this class is still under +development. To revert to the version based on the `multiprocessing.Pool` class +with a patched terminate_pool method, set `DRAGON_BASEPOOL="PATCHED"` in the +environment. 
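As a quick illustration of that environment knob (a sketch only; it assumes `DRAGON_BASEPOOL` is read when Dragon's multiprocessing integration is imported):

    import os
    os.environ["DRAGON_BASEPOOL"] = "PATCHED"  # opt back into the multiprocessing.Pool-based version

    import dragon  # registers the "dragon" start method
    import multiprocessing as mp

    def f(x):
        return x * x

    if __name__ == "__main__":
        mp.set_start_method("dragon")
        with mp.Pool(4) as pool:
            print(pool.map(f, range(8)))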
""" import multiprocessing.pool from multiprocessing import get_start_method @@ -15,6 +15,7 @@ from ..native.pool import Pool as NativePool from ..native.pool import job_counter, mapstar, ApplyResult, MapResult +from ..native.process import Process from ..native.process_group import ProcessGroup from ..globalservices.process import multi_join @@ -23,6 +24,7 @@ import itertools import time import threading +import signal import os @@ -125,6 +127,86 @@ class DragonPoolPatched(multiprocessing.pool.Pool): # Dummy def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) +class WrappedDragonProcess: # Dummy + + def __init__(self, process, ident): + self._puid = ident + if process is None: + self._process = Process(None, ident=self._puid) + + def start(self) -> None: + """Start the process represented by the underlying process object.""" + self._process.start() + + def is_alive(self) -> bool: + """Check if the process is still running + + :return: True if the process is running, False otherwise + :rtype: bool + """ + return self._process.is_alive + + def join(self, timeout: float = None) -> int: + """Wait for the process to finish. + + :param timeout: timeout in seconds, defaults to None + :type timeout: float, optional + :return: exit code of the process, None if timeout occurs + :rtype: int + :raises: ProcessError + """ + return self._process.join() + + def terminate(self) -> None: + """Send SIGTERM signal to the process, terminating it. + + :return: None + :rtype: NoneType + """ + self._process.terminate() + + def kill(self) -> None: + """Send SIGKILL signal to the process, killing it. + + :return: None + :rtype: NoneType + """ + self._process.kill() + + @property + def pid(self): + """Process puid. Globally unique""" + return self._puid + + @property + def name(self) -> str: + """gets serialized descriptors name for the process + + :return: serialized descriptor name of process + :rtype: str + """ + return self._process.name + + @property + def exitcode(self) -> int: + """When the process has terminated, return exit code. None otherwise.""" + return self._process.returncode + + @property + def sentinel(self): + raise NotImplementedError + + @property + def authkey(self): + raise NotImplementedError + + @property + def daemon(self): + raise NotImplementedError + + @property + def close(self): + raise NotImplementedError class WrappedResult: """Returned by all functions that return a result. Wraps ApplyResult and MapResult so that correct timeout error can be raised.""" @@ -177,32 +259,38 @@ def wait(self, timeout: float = None) -> None: self._result.wait(timeout) -class WrappedPG: - """Wraps a ProcessGroup and makes the interface compliant with how self._pool is used in the multiprocessing pool unittests.""" - - def __init__(self, process_group: ProcessGroup = None): - """Initializes wrapped process group - - :param process_group: ProcessGroup used by native pool implementation, defaults to None - :type process_group: ProcessGroup, optional - """ - self._pg = process_group - - def __len__(self) -> int: - """Returns the number of processes in the process group - - :return: the number of processes in the process group - :rtype: int - """ - return self._pg.nproc - - class DragonPool(NativePool): """Dragon's replacement for Multiprocessing Pool.""" def __init__(self, *args, context=None, **kwargs): super().__init__(*args, **kwargs) - self._pool = WrappedPG(self._pg) + + @property + def _pool(self): + puids = self._pg.puids + pool_procs = [] + # need to wait until all procs are up. 
+ while None in self._pg.puids: + time.sleep(0.1) + puids = self._pg.puids + # add a wrapped proc that has an interface like what mp is expecting + for puid in puids: + pool_procs.append(WrappedDragonProcess(None, ident=puid)) + return pool_procs + + def _repopulate_pool(self): + # repopulate pool by shutting PG down and then starting new PG + if self._close_thread is not None: + raise RuntimeError("Trying to repopulate a pool that was previously closed. This pattern is not supported.") + if not self._pg.status == "Stop": + self._pg.kill(signal.SIGTERM) + self._pg.join() + self._pg.stop() + + self._pg = ProcessGroup(restart=True, ignore_error_on_exit=True) + self._pg.add_process(self._processes, self._template) + self._pg.init() + self._pg.start() + def apply_async( self, @@ -272,7 +360,7 @@ def apply(self, func: callable, args: tuple = (), kwds: dict = {}) -> Any: :return: The result of `func(*args, **kwargs)` :rtype: """ - return WrappedResult(self.apply_async(func, args, kwds)).get() + return self.apply_async(func, args, kwds).get() def map(self, func: callable, iterable: Iterable, chunksize: int = None) -> Iterable[Any]: """Apply `func` to each element in `iterable`, collecting the results @@ -287,7 +375,7 @@ def map(self, func: callable, iterable: Iterable, chunksize: int = None) -> Iter :return: list of results from applying `func` to each element of input iterable :rtype: Iterable[Any] """ - return WrappedResult(self.map_async(func, iterable, chunksize)).get() + return self.map_async(func, iterable, chunksize).get() def starmap(self, func: callable, iterable: Iterable[Iterable], chunksize: int = None) -> Iterable[Any]: """Like `map()` method but the elements of the `iterable` are expected to be iterables as well and will be unpacked as arguments. Hence `func` and (a, b) becomes func(a, b). diff --git a/src/dragon/native/pool.py b/src/dragon/native/pool.py index b6bcf73..068884c 100644 --- a/src/dragon/native/pool.py +++ b/src/dragon/native/pool.py @@ -1,4 +1,4 @@ -"""The Dragon native pool manages a pool of child processes. +"""The Dragon native pool manages a pool of child processes.
""" from __future__ import annotations @@ -240,6 +240,8 @@ def __init__( initargs: tuple = (), maxtasksperchild: int = None, policy: Policy = None, + *args, + **kwargs, ): """Init method @@ -426,7 +428,6 @@ def handler(signum, frame): completed_tasks += 1 LOGGER.debug(f"{myp.ident} returning from worker_function") - @classmethod def _handle_results(cls, outqueue, cache, end_event): diff --git a/src/dragon/native/process_group.py b/src/dragon/native/process_group.py index 1cbebe3..2ba0af2 100644 --- a/src/dragon/native/process_group.py +++ b/src/dragon/native/process_group.py @@ -34,10 +34,18 @@ from .lock import Lock from .machine import current as current_node -from ..globalservices.process import multi_join, kill as process_kill, get_create_message, get_create_message_with_argdata, query as process_query +from ..globalservices.process import ( + multi_join, + kill as process_kill, + get_create_message, + get_create_message_with_argdata, + query as process_query, +) from ..globalservices.group import create, kill as group_kill, add_to, create_add_to, remove_from, destroy from ..infrastructure.policy import Policy +from ..infrastructure.policy import Policy, GS_DEFAULT_POLICY + LOG = logging.getLogger(__name__) # exit code returned by cython for sigterm @@ -127,7 +135,7 @@ def run(self, prior_state: BaseState, signal: PGSignals, sig_id: int) -> None: class Idle(BaseState): """This state kills existing processes and does nothing otherwise.""" - forbidden = [PGSignals.SHUTDOWN, PGSignals.KILL] + forbidden = [PGSignals.KILL] def run(self, prior_state: BaseState, signal: PGSignals, sig_id: int) -> None: """The idle state just does nothing except making sure all processes are gone.""" @@ -180,7 +188,9 @@ def run(self, prior_state: BaseState, signal: PGSignals, sig_id: int) -> None: puids = [descr.uid for lst in group_descr.sets for descr in lst] for new_puid in puids: if new_puid not in self.context.puid_to_message_map: - self.context.puid_to_message_map[new_puid] = self.context.puid_to_message_map[puid] + self.context.puid_to_message_map[new_puid] = self.context.puid_to_message_map[ + puid + ] # since we added only one process, we can safely assume that we found it break break @@ -249,8 +259,10 @@ def run(self, prior_state: BaseState, signal: PGSignals, sig_id: int) -> None: destroy(self.context.guid) self.context.guid = self.context._group_descr = None + # end concrete state classes + class GroupContext: """The Context defines the group interface for the manager and the client. In particular, it handles signals and state changes. 
It maintains a @@ -318,18 +330,37 @@ def __init__( """ self.nproc = nproc - self.templates = templates # list of tuples - self.messages = {} # keys are the indices of tuples in self.templates + self.templates = templates # list of tuples + self.messages = {} # keys are the indices of tuples in self.templates # use a dict to make restarting easy and order-safe - self.puid_to_message_map = {} # keys are the puids, values are the keys in self.messages{} + self.puid_to_message_map = {} # keys are the puids, values are the keys in self.messages{} for i, tup in enumerate(templates): t = tup[1] if t.is_python: - self.messages[i] = get_create_message_with_argdata(t.target, t.cwd, t.args, t.env, t.argdata, pmi_required=pmi_enabled, stdin=t.stdin, stdout=t.stdout, stderr=t.stderr) + self.messages[i] = get_create_message_with_argdata( + t.target, + t.cwd, + t.args, + t.env, + t.argdata, + pmi_required=pmi_enabled, + stdin=t.stdin, + stdout=t.stdout, + stderr=t.stderr, + ) else: - self.messages[i] = get_create_message(t.target, t.cwd, t.args, t.env, pmi_required=pmi_enabled, stdin=t.stdin, stdout=t.stdout, stderr=t.stderr) + self.messages[i] = get_create_message( + t.target, + t.cwd, + t.args, + t.env, + pmi_required=pmi_enabled, + stdin=t.stdin, + stdout=t.stdout, + stderr=t.stderr, + ) self.guid = None self._group_descr = None @@ -494,7 +525,9 @@ def _start_group_once(self): if not self._start_time: self._start_time = time.monotonic() - group_descr = create([ (tup[0], self.messages[i].serialize()) for i, tup in enumerate(self.templates) ], self.policy) + group_descr = create( + [(tup[0], self.messages[i].serialize()) for i, tup in enumerate(self.templates)], self.policy + ) self._group_descr = group_descr self.guid = group_descr.g_uid @@ -645,7 +678,7 @@ def __init__( ignore_error_on_exit: bool = False, pmi_enabled: bool = False, walltime: float = None, - policy: Policy = None + policy: Policy = None, ): """Instantiate a number of Dragon processes. @@ -663,7 +696,7 @@ def __init__( :type policy: dragon.infrastructure.policy.Policy """ - self.templates = [] # this will be a list of tuples that will be sent to the GSGroup API + self.templates = [] # this will be a list of tuples that will be sent to the GSGroup API self.nproc = 0 self.restart = restart self.ignore_error_on_exit = ignore_error_on_exit @@ -683,7 +716,9 @@ def add_process(self, nproc: int, template: TemplateProcess) -> None: # if add_process is called after the ProcessGroup is initialized, then we raise if self._group_context: - raise DragonProcessGroupError("You cannot call add_process() to already initialized ProcessGroup. Please use ProcessGroup.create_add_to() instead to add more processes.") + raise DragonProcessGroupError( + "You cannot call add_process() to already initialized ProcessGroup. Please use ProcessGroup.create_add_to() instead to add more processes." 
+ ) self.templates.append((nproc, template)) self.nproc += nproc @@ -691,7 +726,15 @@ def add_process(self, nproc: int, template: TemplateProcess) -> None: def init(self) -> None: """Initialize the GroupContext and Manager.""" - self._group_context = GroupContext(self.templates, self.nproc, self.restart, self.ignore_error_on_exit, self.pmi_enabled, self.walltime, self.policy) + self._group_context = GroupContext( + self.templates, + self.nproc, + self.restart, + self.ignore_error_on_exit, + self.pmi_enabled, + self.walltime, + self.policy, + ) self._manager = Manager() self._send_signal(PGSignals.NEW) diff --git a/src/dragon/native/queue.py b/src/dragon/native/queue.py index b1e2f3d..6de5a5a 100644 --- a/src/dragon/native/queue.py +++ b/src/dragon/native/queue.py @@ -518,13 +518,13 @@ def _close(self): release_refcnt(self._channel.cuid) self._channel.detach() except Exception as e: - LOG.debug(f'Could not complete release of refcount or detach. {e=}') + pass # Could not complete release of refcount or detach. elif self._ext_channel and self._unpickled_instance: try: self._channel.detach() # this should be revisited once refcounting fully works as it should not be necessary then except Exception as e: - LOG.debug(f"We couldn't detach from externally managed channel. {e=}") + pass # We couldn't detach from externally managed channel. if self._joinable: try: release_refcnt(self._cnt_channel.cuid) @@ -532,8 +532,7 @@ def _close(self): release_refcnt(self._ev_channel.cuid) self._ev_channel.detach() except Exception as e: - LOG.debug( - f'Joinable queue: there was a problem with releasing channel refcount or while detaching from channel. {e=}') + pass # Joinable queue: there was a problem with releasing channel refcount or while detaching from channel if self._buffer_pool is not None: try: diff --git a/src/dragon/pydragon_channels.pyx b/src/dragon/pydragon_channels.pyx index 2ede86b..d499720 100644 --- a/src/dragon/pydragon_channels.pyx +++ b/src/dragon/pydragon_channels.pyx @@ -1761,7 +1761,7 @@ cdef class Peer2PeerWritingChannelFile: self.flush() derr = dragon_memory_free(&self._small_blk_descr) - if derr != DRAGON_SUCCESS: + if derr != DRAGON_MAP_KEY_NOT_FOUND and derr != DRAGON_SUCCESS: raise ChannelError("Could not free small message send buffer", derr) finally: diff --git a/src/dragon/pydragon_fli.pyx b/src/dragon/pydragon_fli.pyx new file mode 100644 index 0000000..93b3eb2 --- /dev/null +++ b/src/dragon/pydragon_fli.pyx @@ -0,0 +1,541 @@ +from dragon.dtypes_inc cimport * +from dragon.channels cimport * +from dragon.managed_memory cimport * +import dragon.dtypes as dtypes +import dragon.infrastructure.parameters as dparms +import dragon.infrastructure.facts as dfacts +import dragon.globalservices.channel as dgchan +from dragon.localservices.options import ChannelOptions + +BUF_READ = PyBUF_READ +BUF_WRITE = PyBUF_WRITE +DEFAULT_CLOSE_TIMEOUT = 5 +STREAM_CHANNEL_IS_MAIN = 1010 + +cdef enum: + C_TRUE = 1 + C_FALSE = 0 + +cdef timespec_t* _computed_timeout(timeout, timespec_t* time_ptr): + + if timeout is None: + time_ptr = NULL + elif isinstance(timeout, int) or isinstance(timeout, float): + if timeout < 0: + raise ValueError('Cannot provide timeout < 0.') + + # Anything >= 0 means use that as seconds for timeout. 
+ time_ptr.tv_sec = int(timeout) + time_ptr.tv_nsec = int((timeout - time_ptr.tv_sec)*1000000000) + else: + raise ValueError('The timeout value must be a float or int') + + return time_ptr + +class DragonFLIError(Exception): + + def __init__(self, lib_err, msg): + cdef char * errstr = dragon_getlasterrstr() + + self.msg = msg + self.lib_msg = errstr[:].decode('utf-8') + lib_err_str = dragon_get_rc_string(lib_err) + self.lib_err = lib_err_str[:].decode('utf-8') + free(errstr) + + def __str__(self): + return f"FLI Exception: {self.msg}\n*** Dragon C-level Traceback: ***\n{self.lib_msg}\n*** End C-level Traceback: ***\nDragon Error Code: {self.lib_err}" + +class FLIEOT(DragonFLIError, EOFError): + pass + + +cdef class FLISendH: + """ + Sending handle for FLInterfaces + """ + + cdef: + dragonFLISendHandleDescr_t _sendh + dragonFLIDescr_t _adapter + bool _is_open + + def __init__(self, FLInterface adapter, Channel stream_channel=None, timeout=None, use_main_as_stream_channel=False): + cdef: + dragonError_t derr + dragonChannelDescr_t * c_strm_ch = NULL + timespec_t timer + timespec_t* time_ptr + + self._adapter = adapter._adapter + time_ptr = _computed_timeout(timeout, &timer) + + if stream_channel is not None: + c_strm_ch = &stream_channel._channel + + if use_main_as_stream_channel: + c_strm_ch = STREAM_CHANNEL_IS_MAIN_FOR_1_1_CONNECTION + + with nogil: + derr = dragon_fli_open_send_handle(&self._adapter, &self._sendh, c_strm_ch, time_ptr) + + if derr != DRAGON_SUCCESS: + raise DragonFLIError(derr, "Could not open send handle stream.") + + self._is_open = True + + def close(self, timeout=None): + cdef: + dragonError_t derr + timespec_t timer + timespec_t* time_ptr + + if not self._is_open: + return + + time_ptr = _computed_timeout(timeout, &timer) + + with nogil: + derr = dragon_fli_close_send_handle(&self._sendh, time_ptr) + + if derr != DRAGON_SUCCESS: + raise DragonFLIError(derr, "Could not close send handle stream.") + + self._is_open = False + + def __del__(self): + try: + self.close(timeout=DEFAULT_CLOSE_TIMEOUT) + except: + pass + + def send_bytes(self, bytes data, uint64_t arg=0, bool buffer=False, timeout=None): + cdef: + dragonError_t derr + #uint8_t * c_data + size_t num_bytes + timespec_t timer + timespec_t* time_ptr + int data_len + + if self._is_open == False: + raise RuntimeError("Handle not open, cannot send data.") + + time_ptr = _computed_timeout(timeout, &timer) + + cdef const unsigned char[:] c_data = data + data_len = len(data) + arg_val = arg + + with nogil: + derr = dragon_fli_send_bytes(&self._sendh, data_len, &c_data[0], arg, buffer, time_ptr) + if derr != DRAGON_SUCCESS: + raise DragonFLIError(derr, "Failed to send message over stream channel.") + + def send_mem(self, MemoryAlloc mem, uint64_t arg=0, timeout=None): + cdef: + dragonError_t derr + timespec_t timer + timespec_t* time_ptr + + if self._is_open == False: + raise RuntimeError("Handle not open, cannot send data.") + + time_ptr = _computed_timeout(timeout, &timer) + arg_val = arg + + with nogil: + derr = dragon_fli_send_mem(&self._sendh, &mem._mem_descr, arg, time_ptr) + if derr != DRAGON_SUCCESS: + raise DragonFLIError(derr, "Failed to send memory over stream channel.") + + def create_fd(self, bool buffered=False, size_t chunk_size=0, arg=0, timeout=None): + """ + Opens a writable file-descriptor and returns it. 
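To make the send-handle flow concrete, a minimal sketch (assuming `fli` is an already-created FLInterface, defined later in this file, and using arbitrary 5-second timeouts):

    sh = fli.sendh(timeout=5)        # FLISendH via the convenience method below
    sh.send_bytes(b"hello", arg=42)  # arg carries 64 bits of user metadata
    sh.send_bytes(b" world", arg=42)
    sh.close(timeout=5)              # ends this stream for the receiver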
+ """ + cdef: + dragonError_t derr + int fdes + timespec_t timer + timespec_t* time_ptr + uint64_t user_arg + + if self._is_open == False: + raise RuntimeError("Handle not open, cannot get a file descriptor.") + + time_ptr = _computed_timeout(timeout, &timer) + user_arg = arg + + with nogil: + derr = dragon_fli_create_writable_fd(&self._sendh, &fdes, buffered, chunk_size, user_arg, time_ptr) + if derr != DRAGON_SUCCESS: + raise DragonFLIError(derr, "Could not open writeable file descriptor.") + + return fdes + + def finalize_fd(self): + """ + Flushes a file-descriptor and waits until all buffers are written and the + file descriptor is closed. + """ + cdef: + dragonError_t derr + + if self._is_open == False: + raise RuntimeError("Handle is not open, cannot finalize an fd on a closed send handle.") + + with nogil: + derr = dragon_fli_finalize_writable_fd(&self._sendh) + if derr != DRAGON_SUCCESS: + raise DragonFLIError(derr, "Could not finalize writable file descriptor") + + + +cdef class FLIRecvH: + """ + Receiving handle for FLInterfaces + """ + + cdef: + dragonFLIRecvHandleDescr_t _recvh + dragonFLIDescr_t _adapter + bool _is_open + + def __init__(self, FLInterface adapter, Channel stream_channel=None, timeout=None, use_main_as_stream_channel=False): + """ + Open the handle, optionally with a specific stream channel and timeout + """ + cdef: + dragonError_t derr + dragonChannelDescr_t * c_strm_ch = NULL + timespec_t timer + timespec_t* time_ptr + + # This seems short, might flesh out more later + self._adapter = adapter._adapter + + time_ptr = _computed_timeout(timeout, &timer) + + if stream_channel is not None: + c_strm_ch = &stream_channel._channel + + if use_main_as_stream_channel: + c_strm_ch = STREAM_CHANNEL_IS_MAIN_FOR_1_1_CONNECTION + + with nogil: + derr = dragon_fli_open_recv_handle(&self._adapter, &self._recvh, c_strm_ch, time_ptr) + if derr != DRAGON_SUCCESS: + raise DragonFLIError(derr, "Could not open receive handle stream") + + self._is_open = True + + def close(self, timeout=None): + cdef: + dragonError_t derr + timespec_t timer + timespec_t* time_ptr + + if not self._is_open: + return + + time_ptr = _computed_timeout(timeout, &timer) + + with nogil: + derr = dragon_fli_close_recv_handle(&self._recvh, time_ptr) + if derr != DRAGON_SUCCESS: + raise DragonFLIError(derr, "Could not close receive handle stream") + + self._is_open = False + + def __del__(self): + try: + self.close(timeout=DEFAULT_CLOSE_TIMEOUT) + except: + pass + + def recv_bytes_into(self, unsigned char[::1] bytes_buffer=None, timeout=None): + cdef: + uint64_t arg + size_t max_bytes + size_t num_bytes + timespec_t timer + timespec_t* time_ptr + + if self._is_open == False: + raise RuntimeError("Handle is not open, cannot receive") + + time_ptr = _computed_timeout(timeout, &timer) + + max_bytes = len(bytes_buffer) + + # This gets a memoryview slice of the buffer + cdef unsigned char [:] c_data = bytes_buffer + # To pass in as a pointer, get the address of the 0th index &c_data[0] + with nogil: + derr = dragon_fli_recv_bytes_into(&self._recvh, max_bytes, &num_bytes, &c_data[0], &arg, time_ptr) + if derr == DRAGON_EOT: + raise FLIEOT(derr, "End of Transmission") + if derr != DRAGON_SUCCESS: + raise DragonFLIError(derr, "Could not receive into bytes buffer") + + # Landing pad should be populated, just return arg + return arg + + def recv_bytes(self, timeout=None): + cdef: + dragonError_t derr + size_t num_bytes + size_t max_bytes = 0 + uint8_t * c_data + uint64_t arg + timespec_t timer + timespec_t* time_ptr + + 
if self._is_open == False: + raise RuntimeError("Handle is not open, cannot receive") + + time_ptr = _computed_timeout(timeout, &timer) + + # A max_bytes value of 0 means "get everything" + with nogil: + derr = dragon_fli_recv_bytes(&self._recvh, max_bytes, &num_bytes, &c_data, &arg, time_ptr) + if derr == DRAGON_EOT: + raise FLIEOT(derr, "End of Transmission") + if derr != DRAGON_SUCCESS: + raise DragonFLIError(derr, "Error receiving FLI data") + + # Convert to a memoryview + py_view = PyMemoryView_FromMemory(<char *>c_data, num_bytes, BUF_WRITE) + # Convert memoryview to bytes + py_bytes = py_view.tobytes() + # Release underlying malloc now that we have a copy + free(c_data) + # Return data and metadata as a tuple + return (py_bytes, arg) + + def recv_mem(self, timeout=None): + cdef: + dragonError_t derr + dragonMemoryDescr_t mem + uint64_t arg + timespec_t timer + timespec_t* time_ptr + + if self._is_open == False: + raise RuntimeError("Handle is not open, cannot receive memory object") + + time_ptr = _computed_timeout(timeout, &timer) + + with nogil: + derr = dragon_fli_recv_mem(&self._recvh, &mem, &arg, time_ptr) + if derr == DRAGON_EOT: + raise FLIEOT(derr, "End of Transmission") + if derr != DRAGON_SUCCESS: + raise DragonFLIError(derr, "Error receiving FLI data into memory object") + + mem_obj = MemoryAlloc.cinit(mem) + return (mem_obj, arg) + + def create_fd(self, timeout=None): + """ + Creates a readable file-descriptor and returns it. + """ + cdef: + dragonError_t derr + int fdes + timespec_t timer + timespec_t* time_ptr + + if self._is_open == False: + raise RuntimeError("Handle is not open, cannot create a file descriptor on a closed handle.") + + time_ptr = _computed_timeout(timeout, &timer) + + with nogil: + derr = dragon_fli_create_readable_fd(&self._recvh, &fdes, time_ptr) + if derr != DRAGON_SUCCESS: + raise DragonFLIError(derr, "Could not open readable file descriptor") + + return fdes + + def finalize_fd(self): + """ + Flushes a file-descriptor and waits until all buffers are read and the + file descriptor is closed.
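The matching receive side drains a stream until end-of-transmission (a sketch under the same assumptions as the send-side example above):

    rh = fli.recvh(timeout=5)
    chunks = []
    try:
        while True:
            data, arg = rh.recv_bytes(timeout=5)  # returns (bytes, sender's 64-bit arg)
            chunks.append(data)
    except FLIEOT:
        pass  # the sender closed its handle: end of this stream
    finally:
        rh.close(timeout=5)
    message = b"".join(chunks)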
+ """ + cdef: + dragonError_t derr + + if self._is_open == False: + raise RuntimeError("Handle is not open, cannot finalize an fd on a closed receive handle.") + + with nogil: + derr = dragon_fli_finalize_readable_fd(&self._recvh) + if derr != DRAGON_SUCCESS: + raise DragonFLIError(derr, "Could not finalize readable file descriptor") + + + + +cdef class FLInterface: + """ + Cython wrapper for the File-Like-Interface + """ + + cdef: + dragonFLIDescr_t _adapter + dragonFLISerial_t _serial + bool _is_serialized + list stream_channel_list + MemoryPool pool + + + def __getstate__(self): + return (self.serialize(),self.pool) + + def __setstate__(self, state): + serial_fli, pool = state + if not pool.is_local: + pool = None + self._attach(serial_fli, pool) + + + def _attach(self, ser_bytes, MemoryPool pool=None): + cdef: + dragonError_t derr + dragonFLISerial_t _serial + dragonMemoryPoolDescr_t * mpool = NULL + + _serial.len = len(ser_bytes) + cdef const unsigned char[:] cdata = ser_bytes + _serial.data = &cdata[0] + self._is_serialized = False + + if pool is not None: + mpool = &pool._pool_hdl + self.pool = pool + else: + self.pool = None + + derr = dragon_fli_attach(&_serial, mpool, &self._adapter) + if derr != DRAGON_SUCCESS: + raise DragonFLIError(derr, "Could not attach to FLI adapter") + + return self + + def __del__(self): + if self._is_serialized: + dragon_fli_serial_free(&self._serial) + + def __init__(self, Channel main_ch=None, Channel manager_ch=None, MemoryPool pool=None, + stream_channels=[], bool use_buffered_protocol=False): + + cdef: + dragonError_t derr + dragonChannelDescr_t ** strm_chs = NULL + dragonChannelDescr_t * c_main_ch = NULL + dragonChannelDescr_t * c_mgr_ch = NULL + dragonMemoryPoolDescr_t * c_pool = NULL + Channel ch # Necessary to cast python objects into cython objects when pulling out stream_channel values + + self._is_serialized = False + + ### + ### If creating main and manager channels, make sure their capacity is set to the number of stream channels + ### + + num_stream_channels = len(stream_channels) + + if pool is None and main_ch is None: + # Get default pool muid and create a main_channel from there + default_muid = dfacts.default_pool_muid_from_index(dparms.this_process.index) + ch_options = ChannelOptions(capacity=num_stream_channels) + main_ch = dgchan.create(default_muid, options=ch_options) + + if pool is None and main_ch is not None: + # Do nothing, C code handles this + pass + + # Get pointers to the handles + # This simplifies the actual C call since the pointers will either be NULL or assigned to the objects handle + if main_ch is not None: + c_main_ch = &main_ch._channel + + if manager_ch is not None: + c_mgr_ch = &manager_ch._channel + + if pool is not None: + c_pool = &pool._pool_hdl + self.pool = pool + else: + self.pool = None + + if num_stream_channels > 0: + strm_chs = malloc(sizeof(dragonChannelDescr_t*) * num_stream_channels) + for i in range(num_stream_channels): + ch = stream_channels[i] + strm_chs[i] = &ch._channel + + derr = dragon_fli_create(&self._adapter, c_main_ch, c_mgr_ch, c_pool, + num_stream_channels, strm_chs, use_buffered_protocol, NULL) + + if strm_chs != NULL: + free(strm_chs) # Free our Malloc before error checking to prevent memory leaks + if derr != DRAGON_SUCCESS: + raise DragonFLIError(derr, "Failed to create new FLInterface") + + @classmethod + def create_buffered(cls, Channel main_ch=None, MemoryPool pool=None): + """ + Helper function to more easily create a simple buffered FLInterface + Does not require any 
internal function, it's simply limiting the number of options for the user + in order to make it more straightforward to make an explicitly buffered FLI + """ + return cls(main_ch=main_ch, pool=pool, use_buffered_protocol=True) + + + def destroy(self): + cdef dragonError_t derr + + derr = dragon_fli_destroy(&self._adapter) + if derr != DRAGON_SUCCESS: + raise DragonFLIError(derr, "Failed to destroy FLInterface") + + + def serialize(self): + cdef dragonError_t derr + + if not self._is_serialized: + derr = dragon_fli_serialize(&self._adapter, &self._serial) + if derr != DRAGON_SUCCESS: + raise DragonFLIError(derr, "Failed to serialize FLInterface") + + self._is_serialized = True + + py_obj = self._serial.data[:self._serial.len] + return py_obj + + @classmethod + def attach(cls, serialized_bytes, mem_pool=None): + # If mem_pool is None, the default node-local memorypool will be used + empty_fli = cls.__new__(cls) + return empty_fli._attach(serialized_bytes, mem_pool) + + def detach(self): + cdef dragonError_t derr + + derr = dragon_fli_detach(&self._adapter) + if derr != DRAGON_SUCCESS: + raise DragonFLIError(derr, "Failed to detach from FLI adapter") + + def sendh(self, *args, **kwargs): + """ + Return a new FLI Send Handle object + """ + return FLISendH(self, *args, **kwargs) + + def recvh(self, *args, **kwargs): + """ + Return a new FLI Recv Handle object + """ + return FLIRecvH(self, *args, **kwargs) diff --git a/src/dragon/pydragon_heap.pyx b/src/dragon/pydragon_heap.pyx index 236858b..379a23c 100644 --- a/src/dragon/pydragon_heap.pyx +++ b/src/dragon/pydragon_heap.pyx @@ -8,7 +8,7 @@ cdef class PriorityHeap: cdef dragonPriorityHeap_t _hdl def _handle_err(self, derr, msg): - RuntimeError(f"Priority Heap Error: {msg}, Dragon Err code:({derr})") + raise RuntimeError(f"Priority Heap Error: {msg}, Dragon Err code:({dragon_get_rc_string(derr)})") @staticmethod def create(dragonPriorityHeapUint_t base, dragonPriorityHeapLongUint_t capacity, diff --git a/src/dragon/pydragon_heapmanager.pyx b/src/dragon/pydragon_heapmanager.pyx index 620ae10..ffdef3d 100644 --- a/src/dragon/pydragon_heapmanager.pyx +++ b/src/dragon/pydragon_heapmanager.pyx @@ -98,7 +98,7 @@ cdef class BitSet: self._handle_error(derr, "Error destroying BitSet") def _handle_error(self, dragonError_t derr, err_msg): - raise RuntimeError(err_msg + f" (Dragon Bitset error code={derr})") + raise RuntimeError(err_msg + f" (Dragon Bitset error code={dragon_get_rc_string(derr)})") def get_num_bits(self): @@ -234,7 +234,7 @@ cdef class Heap: self._valid = False def _handle_error(self, dragonError_t derr, err_msg): - raise RuntimeError(err_msg + f" (Dragon DynHeap error code={derr})") + raise RuntimeError(err_msg + f" (Dragon DynHeap error code={dragon_get_rc_string(derr)})") @staticmethod def size(const size_t max_sz_pwr, const size_t min_sz_pwr, const size_t alignment, lock_kind) -> size_t: @@ -243,7 +243,7 @@ cdef class Heap: derr = dragon_heap_size(max_sz_pwr, min_sz_pwr, alignment, lock_kind.value, &mem_size) if derr != DRAGON_SUCCESS: - raise RuntimeError(f"Failed to get DynHeap size. Dragon DynHeap error code=({derr})") + raise RuntimeError(f"Failed to get DynHeap size.
Dragon DynHeap error code=({dragon_get_rc_string(derr)})") return mem_size diff --git a/src/dragon/pydragon_lock.pyx b/src/dragon/pydragon_lock.pyx index 7b2e35e..db8eb3a 100644 --- a/src/dragon/pydragon_lock.pyx +++ b/src/dragon/pydragon_lock.pyx @@ -17,7 +17,7 @@ cdef class DragonLock: cdef dragonLock_t _lock def _handle_err(self, derr, err_msg): - raise RuntimeError(err_msg + f" (Dragon Lock Error Code={derr})") + raise RuntimeError(err_msg + f" (Dragon Lock Error Code={dragon_get_rc_string(derr)})") @staticmethod def size(kind): @@ -95,7 +95,7 @@ cdef class GreedyLock: cdef dragonGreedyLock_t _lock def _handle_err(self, derr, err_msg): - raise RuntimeError(err_msg + f" (Dragon Lock Error Code={derr})") + raise RuntimeError(err_msg + f" (Dragon Lock Error Code={dragon_get_rc_string(derr)})") @staticmethod def size(): @@ -174,7 +174,7 @@ cdef class FIFOLock: cdef dragonFIFOLock_t _lock def _handle_err(self, derr, err_msg): - raise RuntimeError(err_msg + f" (Dragon Shared Lock Error Code={derr})") + raise RuntimeError(err_msg + f" (Dragon Shared Lock Error Code={dragon_get_rc_string(derr)})") @staticmethod def size(): diff --git a/src/dragon/pydragon_managed_memory.pyx b/src/dragon/pydragon_managed_memory.pyx index 560ec47..64fd497 100644 --- a/src/dragon/pydragon_managed_memory.pyx +++ b/src/dragon/pydragon_managed_memory.pyx @@ -100,7 +100,7 @@ cdef class MemoryPoolAttr: derr = dragon_memory_attr_init(&self._mattr) if derr != DRAGON_SUCCESS: - raise RuntimeError(f"MemoryAttr Error: Unable to initialize memory attribute. Dragon Error Code: ({derr})") + raise RuntimeError(f"MemoryAttr Error: Unable to initialize memory attribute. Dragon Error Code: ({dragon_get_rc_string(derr)})") if pre_alloc_blocks is not None: self._mattr.npre_allocs = len(pre_alloc_blocks) @@ -356,7 +356,7 @@ cdef class MemoryPool: derr = dragon_memory_attr_init(&self._mattr) if derr != DRAGON_SUCCESS: - raise RuntimeError(f"MemoryAttr Error: Unable to initialized memory attribute. Dragon Error Code: ({derr})") + raise RuntimeError(f"MemoryAttr Error: Unable to initialize memory attribute. Dragon Error Code: ({dragon_get_rc_string(derr)})") # @MCB: if pre_alloc_blocks is used, build mattr struct if pre_alloc_blocks is not None: diff --git a/src/dragon/pydragon_pmod.pyx b/src/dragon/pydragon_pmod.pyx index ecb22eb..0a745b1 100644 --- a/src/dragon/pydragon_pmod.pyx +++ b/src/dragon/pydragon_pmod.pyx @@ -81,7 +81,7 @@ cdef class PMOD: for hostname in hostname_list: hostname_bytes = hostname.encode('utf-8') hostname_c = hostname_bytes - strncpy(&self._job_params.hostnames[i].name[0], hostname_c, PMOD_MAX_HOSTNAME_LEN) + strncpy(&self._job_params.hostnames[i].name[0], hostname_c, PMOD_MAX_HOSTNAME_LEN-1) i = i + 1 diff --git a/src/dragon/pydragon_utils.pyx b/src/dragon/pydragon_utils.pyx index b3b0ba6..e426555 100644 --- a/src/dragon/pydragon_utils.pyx +++ b/src/dragon/pydragon_utils.pyx @@ -91,4 +91,4 @@ cdef class B64: :param the_str: base64 encoded string. :return: original bytes representation.
""" - return cls.from_str(the_str).decode() \ No newline at end of file + return cls.from_str(the_str).decode() diff --git a/src/dragon/transport/tcp/__main__.py b/src/dragon/transport/tcp/__main__.py index ea50e49..b77a335 100644 --- a/src/dragon/transport/tcp/__main__.py +++ b/src/dragon/transport/tcp/__main__.py @@ -229,6 +229,10 @@ async def tcp_transport_agent(node_index: str = None, if isinstance(msg, halt_msg): LOGGER.info(f'Received {type(msg)}') break + elif isinstance(msg, dmsg.TAUpdateNodes): + agent.update_nodes(msg.nodes) + LOGGER.info(f'Received {type(msg)}') + continue LOGGER.warning(f'Received unsupported control message: {msg}') LOGGER.debug('Agent is not running, terminating') diff --git a/src/dragon/transport/tcp/agent.py b/src/dragon/transport/tcp/agent.py index 7255ac0..89ae2ac 100644 --- a/src/dragon/transport/tcp/agent.py +++ b/src/dragon/transport/tcp/agent.py @@ -10,6 +10,8 @@ from .transport import Address, LOOPBACK_ADDRESS_IPv4, Transport from ...dtypes import DEFAULT_WAIT_MODE, WaitMode +from ...infrastructure.node_desc import NodeDescriptor + LOGGER = logging.getLogger('dragon.transport.tcp.agent') @@ -165,6 +167,41 @@ def add_client(self, client: Client): if self.is_running(): self._start_client(client) + def _update_client_nodes(self, node_update_map: dict[int, Address]): + """Update nodes dictionary for routing messages in the client + + :param nodes: Nodes to add to our internal data + :type nodes: list[NodeDescriptor] + """ + for client in self._clients: + client.update_nodes(node_update_map) + + def update_nodes(self, nodes: list[NodeDescriptor]): + """Update the nodes dictionary used to route messages + + :param nodes: Nodes to add to our internal data + :type nodes: list[NodeDescriptor] + """ + # Update local object + try: + node_update_map = {} + + for node in nodes: + try: + addr = Address.from_netloc(str(node.ip_addrs[0])) + node_update_map[int(node.host_id)] = Address(addr.host, addr.port or node.port) + except Exception: + LOGGER.critical(f'Failed to update agent node-address mapping for {node}') + raise + + self.nodes.update(node_update_map) + except Exception: + LOGGER.critical(f'Failed to update agent node-address mapping') + raise + + # Update clients + self._update_client_nodes(node_update_map) + if __name__ == '__main__': from .task import cancel_all_tasks diff --git a/src/dragon/transport/tcp/client.py b/src/dragon/transport/tcp/client.py index e832173..31d9230 100644 --- a/src/dragon/transport/tcp/client.py +++ b/src/dragon/transport/tcp/client.py @@ -5,6 +5,8 @@ from ...channels import GatewayMessage, Channel, ChannelEmpty, ChannelRecvTimeout from ...dtypes import WaitMode, DEFAULT_WAIT_MODE +from ...infrastructure.node_desc import NodeDescriptor + from .errno import get_errno, DRAGON_TIMEOUT from .io import UUIDBytesIO from .messages import ErrorResponse, EventRequest, EventResponse, \ @@ -107,6 +109,18 @@ async def recv(self, interval: float = 0.1) -> GatewayMessage: finally: msg.destroy() + def update_nodes(self, node_update_map: dict[int, Address]): + """Update the node dictionary for routing gateway requests + + :param nodes: Nodes to add to our internal data + :type nodes: list[NodeDescriptor] + """ + try: + self.nodes.update(node_update_map) + except Exception: + LOGGER.critical(f'Failed to update client node-address mapping') + raise + def process(self, msg: GatewayMessage) -> asyncio.Task: # Look up destination node address try: diff --git a/src/include/Makefile b/src/include/Makefile index 2b63d92..d79d08d 100644 --- 
a/src/include/Makefile +++ b/src/include/Makefile @@ -12,7 +12,8 @@ HEADERS = dragon/bcast.h \ dragon/return_codes.h \ dragon/return_codes_map.h \ dragon/shared_lock.h \ - dragon/utils.h + dragon/utils.h \ + dragon/fli.h DISTFILES = $(addprefix $(INSTALL_DIR)/include/,$(HEADERS)) diff --git a/src/include/dragon/channels.h b/src/include/dragon/channels.h index 3985e64..3314310 100644 --- a/src/include/dragon/channels.h +++ b/src/include/dragon/channels.h @@ -428,6 +428,9 @@ dragon_channel_attach(const dragonChannelSerial_t* ch_ser, dragonChannelDescr_t* dragonError_t dragon_channel_detach(dragonChannelDescr_t* ch); +dragonError_t +dragon_channel_descr_clone(dragonChannelDescr_t * newch_descr, const dragonChannelDescr_t * oldch_descr); + dragonError_t dragon_channel_get_pool(const dragonChannelDescr_t* ch, dragonMemoryPoolDescr_t* pool_descr); diff --git a/src/include/dragon/fli.h b/src/include/dragon/fli.h new file mode 100644 index 0000000..c466223 --- /dev/null +++ b/src/include/dragon/fli.h @@ -0,0 +1,698 @@ +/* + Copyright 2020, 2022 Hewlett Packard Enterprise Development LP +*/ +#ifndef HAVE_DRAGON_FLI_H +#define HAVE_DRAGON_FLI_H + +#include +#include +#include +#include + +#ifdef __cplusplus extern "C" { #endif + +/** @defgroup fli_structs API Structures + * + * The fli API structures. + * @{ + */ + +static dragonChannelDescr_t* const STREAM_CHANNEL_IS_MAIN_FOR_1_1_CONNECTION = (dragonChannelDescr_t*)0x0000000000001111; + +/** + * @brief The attributes structure of an fli adapter. + * + * This structure contains members that can tune a file-like interface (fli) + * adapter. + **/ + +typedef struct dragonFLIAttr_st { + dragonULInt _placeholder; +} dragonFLIAttr_t; + +/** + * @brief An opaque fli descriptor + * + * When a file-like interface adapter is created, an fli descriptor is + * initialized for the current process. These fli descriptors may be + * shared with other processes by first serializing them, and then passing the + * serialized descriptor to another process. The other process must then + * attach to the fli adapter using the serialized descriptor. Attaching and + * creating are the two means of initializing an fli descriptor. Serializing + * and attaching are provided as convenience functions. FLI adapters can also + * be re-created from their component parts, but a serialized descriptor + * encapsulates the necessary component parts. + * +*/ +typedef struct dragonFLIDescr_st { + uint64_t _idx; +} dragonFLIDescr_t; + +/** + * @brief A serialized FLI adapter + * + * A serialized FLI adapter can be passed to other processes as a convenience for + * attaching from other processes. FLI adapters can also be re-created from their + * constituent parts. +*/ + +typedef struct dragonFLISerial_st { + size_t len; /*!< The length of the serialized descriptor in bytes. */ + uint8_t * data; /*!< The serialized descriptor data to be shared. */ +} dragonFLISerial_t; + +/***/ +/** + * @brief An FLI Send Handle Descriptor + * + * When an adapter is open for sending, a send handle descriptor is provided + * which is initialized and used until closed. The send handle descriptor is + * an opaque reference to a send handle. + * +*/ +typedef struct dragonFLISendHandleDescr_st { + dragonULInt _idx; +} dragonFLISendHandleDescr_t; + +/** + * @brief An FLI Receive Handle Descriptor + * + * When an adapter is open for receiving, a recv handle descriptor is provided + * which is initialized and used until closed. The recv handle descriptor is + * an opaque reference to a recv handle.
+ * +*/ +typedef struct dragonFLIRecvHandleDescr_st { + dragonULInt _idx; +} dragonFLIRecvHandleDescr_t; + +/** @} */ // end of fli_structs group. + +/** + * @brief Initialize attributes for an FLI adapter. + * + * Initialize an attributes structure for FLI adapter customization. You + * initialize first by calling this function and then you can customize + * any of the attributes contained within the structure to suit your application. + * + * @param attr is a pointer to the attributes structure to be initialized. + * + * @return DRAGON_SUCCESS or a return code to indicate what problem occurred. + */ +dragonError_t +dragon_fli_attr_init(dragonFLIAttr_t* attr); + +/** + * @brief Create an FLI adapter. + * + * Create an FLI adapter. An FLI adapter guarantees that a send and receive handle + * pair is between one sender and one receiver and will not have to deal with data + * interleaving from other processes. In addition, data may be streamed between + * the sender and receiver when the FLI adapter is not used in buffered mode. + * FLI adapters may be created in one of several modes. + * + * When the main channel is provided the FLI adapter will be used in one of + * three modes. + * + * 1. In buffered mode the main channel is used to communicate on a many + * to many style connection where each send conversation is a complete conversation + * between a sender and a receiver. In this mode, if multiple sends are done, they + * are buffered before sending. In buffered mode receivers will receive the sent + * message as one receive operation, even if multiple sends were performed. + * + * 2. In non-buffered mode the main channel is a channel of stream channels and the + * API manages allocating a stream channel to an open send handle and providing that + * stream channel to a receiver by placing its serialized descriptor into the main + * channel to be picked up by opening a receive handle. In this case the main channel + * is used to manage 1:1 conversations between an open send handle and an open receive handle. + * + * When using this mode, the stream channels come from one of two locations. Either + * there is a manager channel which manages a set of stream channels to be used when + * sending data, OR a sender may provide a stream channel when opening the send handle. + * + * 3. There is one special case when a main channel is used in non-buffered mode + * and it is known that there is a single sender and single receiver using the FLI + * adapter. In this case, both the sender and receiver must specify a special + * constant of STREAM_CHANNEL_IS_MAIN_FOR_1_1_CONNECTION for the stream channel + * argument when opening the send handle and when opening the receive handle. + * + * If the mgr_ch is not NULL, then it is used in one of a couple different + * ways. Please note: When creating an FLI adapter using the buffered protocol + * no manager channel should be specified. + * + * 1. When created in non-buffered mode, the manager channel contains a set + * of serialized descriptors for stream channels that will be provided + * to send handles when they are opened. If no main channel exists, then + * the stream channel must be provided when the receiver opens a receive + * handle. If stream channels are provided when the receive handle is opened + * then no main channel is required. + * + * 2. When the fli adapter is created, the user may provide a main channel, + * a manager channel, and a set of stream channels.
In this case, the fli + * adapter will maintain the stream channels and dole them out and re-use them + * as send and receive handles are opened and closed, always guaranteeing that + * any conversation between sender and receiver will not be interleaved with + * data from other processes. + * + * If desired, a stream channel can be provided on either send handle open or + * receive handle open operations. In that way, the stream channel can be + * allocated by either the sender or receiver, but not both. When a stream + * channel is provided on a send or receive open operation your application + * must decide whether the senders or receivers will be supplying the stream + * channels. When stream channels are provided on send handle open operations, + * a manager channel is not necessary. When stream channels are provided on + * receive handle open operations, a main channel is not necessary. + * + * Sharing FLI adapters is possible either by serializing and attaching to the + * adapter or by re-creating it from its constituent parts. NOTE: When + * re-creating an adapter by calling this function, the strm_channels should + * only be provided on the initial call to create. Providing them a second + * time will result in the channels being added more than once into the + * adapter which could lead to unpredictable results. + * + * @param adapter is a descriptor and opaque handle to the FLI adapter and is + * initialized by this call. + * + * @param main_ch is a channel descriptor for the main channel of this FLI + * adapter. It is used internally in the adapter. After the life of the + * adapter it is up to user code to clean up this channel. + * + * @param mgr_ch is a channel used internally by the FLI adapter and not to be + * touched by user code during the life of the adapter. After the life of the + * adapter it is up to user code to clean up this channel. Supplying a NULL + * mgr_ch argument indicates this is either a buffered FLI adapter and must be + * accompanied by a value of 0 for the num_fli_chs argument or a stream channel + * will be supplied on all send operations. + * + * @param pool is a pool to use for internal allocations necessary for the + * operation of the adapter. If pool is NULL, then the pool of the + * main_ch channel will be used for required adapter allocations. If the main + * channel is not local, the default local pool will be used. + * + * @param num_strm_chs is the number of supplied stream channels that are provided + * on the creation of the FLI adapter. Each stream channel may be re-used and + * is used for one stream of messages that result from an open, multiple sends, and a + * close operation. + * + * @param strm_channels is an array of channel descriptors, num_strm_chs of them, that + * are being supplied on the adapter creation. See the longer discussion in the description + * of the adapter create above. The application is responsible for the clean up of these + * channels at the end of their life. + * + * @param use_buffered_protocol if true then only a main channel should be provided, no + * manager channel or stream channels are required. In this case all sent data is + * buffered into one message for each file write operation (all sends on an open send + * handle). The receiving side receives one message per completed file write operation. + * + * @param attr is a pointer to the attributes structure that was previously + * initialized. If the attr arg is NULL the default attributes will be used.
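Mirroring the modes described above from the Python wrapper (a sketch only; channel and pool construction are elided and the variable names are illustrative):

    # Buffered mode: a single main channel, one whole message per receive.
    buffered = FLInterface.create_buffered(main_ch=main)

    # Managed stream-channel mode: main and manager channels plus a set of
    # pre-supplied stream channels that the adapter doles out and re-uses.
    streamed = FLInterface(main_ch=main, manager_ch=mgr,
                           stream_channels=[s1, s2, s3])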
+ * + * @return DRAGON_SUCCESS or a return code to indicate what problem occurred. + */ +dragonError_t +dragon_fli_create(dragonFLIDescr_t* adapter, dragonChannelDescr_t* main_ch, + dragonChannelDescr_t* mgr_ch, dragonMemoryPoolDescr_t* pool, + const dragonULInt num_strm_chs, dragonChannelDescr_t** strm_channels, + const bool use_buffered_protocol, dragonFLIAttr_t* attrs); + +/** + * @brief Destroy the adapter. + * + * All internal, process local resources are freed by making this call. Calling + * destroy does not destroy the underlying channels which were provided when + * the adapter was created. + * + * @param adapter is a descriptor and opaque handle to the FLI adapter. + * + * @return DRAGON_SUCCESS or a return code to indicate what problem occurred. + */ +dragonError_t +dragon_fli_destroy(dragonFLIDescr_t* adapter); + +/** + * @brief Serialize an FLI adapter for sharing with another process. When sharing + * an FLI adapter with another process you may use this function to create a + * shareable serialized descriptor. This creates a binary string which may not + * be ASCII compliant. Before sharing, if ASCII compliance is required, call a + * base64 encoder like the dragon_base64_encode found in dragon/utils.h before + * sharing and dragon_base64_decode before attaching from the other process. + * + * NOTE: You must call dragon_fli_serial_free to free a serialized descriptor + * after calling this function to free the extra space allocated by this + * function once you are done with the serialized descriptor. + * + * @param adapter is a valid FLI adapter that has previously been created or attached. + * + * @param serial is a serialized descriptor that will be initialized with the correct + * byte count and serialized bytes so it can be passed to another process. + * + * @return DRAGON_SUCCESS or a return code to indicate what problem occurred. +**/ +dragonError_t +dragon_fli_serialize(const dragonFLIDescr_t* adapter, dragonFLISerial_t* serial); + + +/** + * @brief Free the internal resources of a serialized FLI descriptor + * + * This frees internal structures of a serialized FLI descriptor. It does not + * destroy the FLI adapter itself. + * + * @param serial is a serialized FLI descriptor. + * + * @return DRAGON_SUCCESS or a return code to indicate what problem occurred. + */ +dragonError_t +dragon_fli_serial_free(dragonFLISerial_t* serial); + + +/** + * @brief Attach to an FLI adapter + * + * Calling this attaches to an FLI adapter by using a serialized FLI descriptor + * that was passed to this process. The serialized FLI descriptor must have + * been created using the dragon_fli_serialize function. + * + * @param serial is a pointer to the serialized FLI descriptor. + * + * @param pool is the pool to use for memory allocations when sending or + * receiving on this adapter. If NULL is provided, then the default node-local + * memory pool will be used. + * + * @param adapter is a pointer to an FLI descriptor that will be initialized by + * this call. + * + * @return DRAGON_SUCCESS or a return code to indicate what problem occurred. + */ +dragonError_t +dragon_fli_attach(const dragonFLISerial_t* serial, const dragonMemoryPoolDescr_t* pool, + dragonFLIDescr_t* adapter); + +/** + * @brief Detach from an adapter. + * + * All internal, process local resources are freed by making this call. Calling + * detach does not destroy or detach the underlying channels which were + * provided when the adapter was created.
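Since serialize, attach, and detach correspond one-to-one with the FLInterface methods shown earlier, the sharing pattern from Python looks like this sketch (how the serialized bytes travel between processes is up to the application):

    ser = fli.serialize()             # bytes; base64-encode them if ASCII is required
    # ... hand ser to another process ...
    remote = FLInterface.attach(ser)  # a None mem_pool selects the default node-local pool
    # ... use remote.sendh() / remote.recvh() ...
    remote.detach()                   # frees process-local resources only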
+ * + * @param adapter is a descriptor and opaque handle to the FLI adapter. + * + * @return DRAGON_SUCCESS or a return code to indicate what problem occurred. + */ +dragonError_t +dragon_fli_detach(dragonFLIDescr_t* adapter); + +/** + * @brief Open a Send Handle + * + * When writing to the file like adapter interface you must first open a send + * handle, write using the send operation, and then close the send handle. The + * adapter guarantees that a receiver will receive the data in the same order + * it was sent, but not necessarily in the same size chunks. + * + * @param adapter is a created or attached FLI descriptor. + * + * @param send_handle is a send handle that will be initialized by this call and + * is to be used on subsequent send operations until this stream is closed. + * + * @param strm_ch is a stream channel to be used as a direct connection to a + * receiving process. A stream channel can only be specified for a receiver or + * a sender, but not both. + * + * As a special case, when there is a known single receiver and single sender + * using this FLI adapter, the special constant + * STREAM_CHANNEL_IS_MAIN_FOR_1_1_CONNECTION may be used for this stream channel + * argument. In that case, the same constant must be used for the stream channel + * when opening the receive handle. No manager channel should exist in this case. + * As the constant indicates, the main channel will be used as the stream channel + * in this special case. + * + * @param timeout is a pointer to a timeout structure. If NULL, then wait forever + * with no timeout. If not NULL, then wait for the specified amount of time and + * return DRAGON_TIMEOUT if not successful. If 0,0 is provided, then that indicates + * that a try-once attempt is to be made. + * + * @return DRAGON_SUCCESS or a return code to indicate what problem occurred. +**/ +dragonError_t +dragon_fli_open_send_handle(const dragonFLIDescr_t* adapter, dragonFLISendHandleDescr_t* send_handle, + dragonChannelDescr_t* strm_ch, const timespec_t* timeout); + +/** + * @brief Close a Send Handle + * + * All send operations between an open and a close operation are guaranteed to be received + * in order by a receiving process. A send handle should be closed once the sender has + * completed sending data. Any buffered data is sent upon closing the send handle. + * + * @param send_handle is the open send handle to be closed. + * + * @param timeout to be used in attempting to send to the adapter's channel. + * + * @return DRAGON_SUCCESS or a return code to indicate what problem occurred. +**/ +dragonError_t +dragon_fli_close_send_handle(dragonFLISendHandleDescr_t* send_handle, + const timespec_t* timeout); + +/** + * @brief Open a Receive Handle + * + * When receiving from the file like adapter interface you must first open a + * receive handle, receive using the recv operation, and then close the + * receive handle. The adapter guarantees that a receiver will receive the + * data in the same order it was sent, but not necessarily in the same size + * chunks. + * + * @param adapter is a created or attached FLI descriptor. + * + * @param recv_handle is a receive handle that will be initialized by this call and + * is to be used on subsequent recv operations until this stream is closed. + * + * @param strm_ch is a stream channel to be used as a direct connection to a + * receiving process. A stream channel can only be specified for a receiver or + * a sender, but not both. When using the buffered protocol it is not valid + * to use a stream channel.
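The Python wrapper exposes this 1:1 special case through the use_main_as_stream_channel flag on both handle types (a sketch; both ends must opt in, and no manager channel may exist):

    sh = fli.sendh(use_main_as_stream_channel=True)   # sender side
    rh = fli.recvh(use_main_as_stream_channel=True)   # receiver side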
+/**
+ * @brief Open a Receive Handle
+ *
+ * When receiving from the file-like adapter interface you must first open a
+ * receive handle, receive using the recv operation, and then close the
+ * receive handle. The adapter guarantees that a receiver will receive the
+ * data in the same order it was sent, but not necessarily in the same size
+ * chunks.
+ *
+ * @param adapter is a created or attached FLI descriptor.
+ *
+ * @param recv_handle is a receive handle that will be initialized by this call and
+ * is to be used on subsequent recv operations until this stream is closed.
+ *
+ * @param strm_ch is a stream channel to be used as a direct connection to a
+ * receiving process. A stream channel can only be specified for a receiver or
+ * a sender, but not both. When using the buffered protocol it is not valid
+ * to use a stream channel. When not providing a stream channel, NULL should be
+ * specified.
+ *
+ * As a special case, when there is a known single receiver and a single sender
+ * using this FLI adapter, the special constant
+ * STREAM_CHANNEL_IS_MAIN_FOR_1_1_CONNECTION may be used for this stream channel
+ * argument. In that case, the same constant must be used for the stream channel
+ * when opening the send handle. No manager channel should exist in this case.
+ * As the constant indicates, the main channel will be used as the stream channel
+ * in this special case.
+ *
+ * @param timeout is a pointer to a timeout structure. If NULL, then wait forever
+ * with no timeout. If not NULL, then wait for the specified amount of time and
+ * return DRAGON_TIMEOUT if not successful. If 0,0 is provided, then that indicates
+ * that a try-once attempt is to be made.
+ *
+ * @return DRAGON_SUCCESS or a return code to indicate what problem occurred.
+**/
+dragonError_t
+dragon_fli_open_recv_handle(const dragonFLIDescr_t* adapter, dragonFLIRecvHandleDescr_t* recv_handle,
+                            dragonChannelDescr_t* strm_ch, const timespec_t* timeout);
+
+/**
+ * @brief Close a Recv Handle
+ *
+ * All receive operations between an open and a close operation are guaranteed to be received
+ * in order by a receiving process. A recv handle should be closed once the sender has
+ * completed sending data; end of transmission is indicated by a return code on a recv
+ * operation.
+ *
+ * @param recv_handle is the open receive handle to be closed.
+ *
+ * @param timeout is used for returning the stream channel to the adapter in some
+ * configurations. Otherwise it is ignored.
+ *
+ * @return DRAGON_SUCCESS or a return code to indicate what problem occurred.
+**/
+dragonError_t
+dragon_fli_close_recv_handle(dragonFLIRecvHandleDescr_t* recv_handle, const timespec_t* timeout);
+
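+/*
+ * A minimal receive-side sketch (illustrative only; assumes the same adapter
+ * and elides error handling). A requested_size of 0 reads all available
+ * bytes, and DRAGON_EOT ends the stream:
+ *
+ *     dragonFLIRecvHandleDescr_t recvh;
+ *     uint8_t* bytes = NULL;
+ *     size_t num_bytes;
+ *     uint64_t arg;
+ *     dragonError_t err;
+ *
+ *     dragon_fli_open_recv_handle(&fli, &recvh, NULL, NULL);
+ *     while ((err = dragon_fli_recv_bytes(&recvh, 0, &num_bytes, &bytes,
+ *                                         &arg, NULL)) == DRAGON_SUCCESS) {
+ *         // ... process num_bytes bytes ...
+ *         free(bytes);   // the receiver owns the returned space
+ *         bytes = NULL;  // so the next call allocates fresh space
+ *     }
+ *     // err == DRAGON_EOT on a clean end of stream; num_bytes may still
+ *     // describe final valid data per the documentation below.
+ *     dragon_fli_close_recv_handle(&recvh, NULL);
+ */
+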
+/**
+ * @brief Create a file descriptor to send bytes over an FLI adapter.
+ *
+ * All writes to the file descriptor will be sent over the FLI adapter. Writes
+ * are either buffered or sent immediately as chosen on the call to this
+ * function. Writes to the file descriptor are sent through the supplied send
+ * handle. A stream channel may be supplied depending on the chosen form of
+ * transport.
+ *
+ * @param send_handle is a send_handle for the FLI descriptor. It should
+ * be initialized before calling this function. After closing the returned
+ * file descriptor, the send_handle should also be closed to ensure proper
+ * operation.
+ *
+ * @param fd_ptr is a pointer to an integer. The integer will be initialized to
+ * the file descriptor value.
+ *
+ * @param buffer is a constant of either false (or 0 or NULL), which means use
+ * the default behavior, or true in which case it buffers the data until
+ * the file descriptor is closed.
+ *
+ * @param chunk_size is the size of chunks that are attempted to be read from
+ * the file descriptor on each send operation. This can be used to fine-tune
+ * message sending efficiency through the file descriptor. A chunk size of 0
+ * will result in using the default chunk size of 1K.
+ *
+ * @param arg is a user-defined 64-bit argument to be passed through on the writes
+ * to the fli. This argument is not retrievable via a readable file descriptor, but
+ * it is accessible via the other methods of reading from the fli.
+ *
+ * @param timeout is a pointer to a timeout structure. If NULL, then wait forever
+ * with no timeout to open the file descriptor. If not NULL, then wait for the
+ * specified amount of time and return DRAGON_TIMEOUT if not successful. If 0,0
+ * is provided, then that indicates that a try-once attempt is to be made.
+ *
+ * @return DRAGON_SUCCESS or a return code to indicate what problem occurred.
+ **/
+dragonError_t
+dragon_fli_create_writable_fd(dragonFLISendHandleDescr_t* send_handle, int* fd_ptr,
+                              const bool buffer, size_t chunk_size,
+                              const uint64_t arg, const timespec_t* timeout);
+
+/**
+ * @brief Finalize and destroy the writable file descriptor.
+ *
+ * This should be called after closing the created writable file descriptor to
+ * ensure that all buffers are flushed before continuing to use the send handle.
+ * Note that calling this will hang if the file descriptor has not been closed
+ * prior to this call.
+ *
+ * @param send_handle is a valid send handle that was previously used to create a
+ * writable file descriptor.
+ *
+ * @return DRAGON_SUCCESS or a return code to indicate what problem occurred.
+ */
+dragonError_t
+dragon_fli_finalize_writable_fd(dragonFLISendHandleDescr_t* send_handle);
+
+
+/**
+ * @brief Create a file descriptor to receive bytes over an FLI adapter.
+ *
+ * All reads from the file descriptor will be received over the FLI adapter.
+ * Data is returned in the order it was sent. If buffering was used during
+ * sending, then reads from the file descriptor may not match the size and
+ * number of the writes that were done. However, the total quantity of bytes
+ * will be as it was sent. Reads from the file descriptor are received through
+ * the supplied receive handle until the file descriptor signals end of
+ * stream. A stream channel may be supplied depending on the chosen form of
+ * transport.
+ *
+ * @param recv_handle is a recv_handle for the FLI descriptor. It should
+ * be initialized before calling this function. After closing the returned
+ * file descriptor, the recv_handle should also be closed to ensure proper
+ * operation.
+ *
+ * @param fd_ptr is a pointer to an integer. The integer will be initialized to
+ * the file descriptor value.
+ *
+ * @param timeout is a pointer to a timeout structure. If NULL, then wait forever
+ * with no timeout to open the file descriptor. If not NULL, then wait for the
+ * specified amount of time and return DRAGON_TIMEOUT if not successful. If 0,0
+ * is provided, then that indicates that a try-once attempt is to be made.
+ *
+ * @return DRAGON_SUCCESS or a return code to indicate what problem occurred.
+ **/
+dragonError_t
+dragon_fli_create_readable_fd(dragonFLIRecvHandleDescr_t* recv_handle, int* fd_ptr,
+                              const timespec_t* timeout);
+
+/**
+ * @brief Finalize and destroy the readable file descriptor.
+ *
+ * This should be called after closing the created readable file descriptor to
+ * ensure that all buffers are flushed before continuing to use the receive handle.
+ * Note that calling this will hang if the file descriptor has not been closed
+ * prior to this call.
+ *
+ * @param recv_handle is a valid receive handle that was previously used to create a
+ * readable file descriptor.
+ *
+ * @return DRAGON_SUCCESS or a return code to indicate what problem occurred.
+ */
+dragonError_t
+dragon_fli_finalize_readable_fd(dragonFLIRecvHandleDescr_t* recv_handle);
+
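+/*
+ * A minimal sketch of the file descriptor interface (illustrative only;
+ * assumes an already opened send handle sendh and elides error handling):
+ *
+ *     int fd;
+ *     dragon_fli_create_writable_fd(&sendh, &fd, false, 0, 0, NULL);
+ *     write(fd, "hello", 5);  // bytes flow through the FLI adapter
+ *     close(fd);              // close the fd first...
+ *     dragon_fli_finalize_writable_fd(&sendh); // ...then flush and finalize
+ *
+ * The readable side mirrors this with dragon_fli_create_readable_fd, read(),
+ * close(), and dragon_fli_finalize_readable_fd on an open receive handle.
+ */
+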
+/**
+ * @brief Send bytes through the FLI adapter.
+ *
+ * All send operations between an open and a close of a send handle are guaranteed to
+ * be received by one receiver in the order they were sent.
+ *
+ * @param send_handle is an open send handle.
+ *
+ * @param num_bytes is the number of bytes to be sent.
+ *
+ * @param bytes is a pointer to the data to be sent.
+ *
+ * @param arg is meta-data assigned in a 64-bit field that can be set and will be
+ * received by the receiving side. It does not affect the message itself. When using
+ * the buffered protocol, only the first write into an open send handle will allow
+ * this arg to be passed along. All other values of this arg on subsequent writes
+ * to an open send handle are ignored.
+ *
+ * @param buffer is a constant of either false (or 0 or NULL), which means use
+ * the default behavior, or true in which case it buffers the data until
+ * it is told to flush the data by either sending more data with buffer == false
+ * or by closing the send handle. This is only valid when NOT using the
+ * buffered protocol and you want to buffer the data into one message before
+ * sending. This argument is ignored when sending via a buffered adapter.
+ *
+ * @param timeout is a pointer to a timeout structure. If NULL, then wait forever
+ * with no timeout. If not NULL, then wait for the specified amount of time and
+ * return DRAGON_TIMEOUT if not successful. If 0,0 is provided, then that indicates
+ * that a try-once attempt is to be made.
+ *
+ * @return DRAGON_SUCCESS or a return code to indicate what problem occurred.
+ **/
+dragonError_t
+dragon_fli_send_bytes(dragonFLISendHandleDescr_t* send_handle, size_t num_bytes,
+                      uint8_t* bytes, uint64_t arg, const bool buffer, const timespec_t* timeout);
+
+/**
+ * @brief Send shared memory through the FLI adapter.
+ *
+ * All send operations between an open and a close of a send handle are guaranteed to
+ * be received by one receiver in the order they were sent.
+ *
+ * @param send_handle is an open send handle.
+ *
+ * @param mem is a memory descriptor pointer to Dragon managed memory to be sent.
+ *
+ * @param arg is meta-data assigned in a 64-bit field that can be set and will be
+ * received by the receiving side. It does not affect the message itself. When using
+ * the buffered protocol, only the first write into an open send handle will allow
+ * this arg to be passed along. All other values of this arg on subsequent writes
+ * to an open send handle are ignored.
+ *
+ * @param timeout is a pointer to a timeout structure. If NULL, then wait forever
+ * with no timeout. If not NULL, then wait for the specified amount of time and
+ * return DRAGON_TIMEOUT if not successful. If 0,0 is provided, then that indicates
+ * that a try-once attempt is to be made.
+ *
+ * @return DRAGON_SUCCESS or a return code to indicate what problem occurred.
+ **/
+dragonError_t
+dragon_fli_send_mem(dragonFLISendHandleDescr_t* send_handle, dragonMemoryDescr_t* mem,
+                    uint64_t arg, const timespec_t* timeout);
+
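+/*
+ * An illustrative sketch of sender-side buffering on a non-buffered adapter
+ * (assumes an open send handle; error handling elided). With buffer == true
+ * the data accumulates locally and travels as one message; the first
+ * buffered write's arg (7 here) is the one retained:
+ *
+ *     dragon_fli_send_bytes(&sendh, n1, part1, 7, true, NULL);  // buffered
+ *     dragon_fli_send_bytes(&sendh, n2, part2, 0, false, NULL); // flushes
+ *     // Closing the send handle would also flush any remaining buffer.
+ */
+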
+/**
+ * @brief Receive data from the FLI adapter.
+ *
+ * All receive operations between an open and a close of a recv handle are
+ * guaranteed to be received by one receiver in the order they were sent. If
+ * the return code comes back with DRAGON_EOT, then there is no more
+ * data to be received. When DRAGON_EOT is returned there may be valid data
+ * with it. The received_size will always indicate the amount of valid data returned.
+ *
+ * @param recv_handle is an open receive handle.
+ *
+ * @param requested_size is the maximum number of bytes to receive. Fewer bytes
+ * may be received; received_size provides the actual number of bytes read. If
+ * requested_size==0 then all available bytes are read.
+ *
+ * @param received_size is a pointer to a variable that will be initialized with the
+ * number of received bytes.
+ *
+ * @param bytes points to a pointer that will be initialized with the received bytes.
+ * The space pointed to by bytes after this call must be freed.
+ *
+ * @param arg is a pointer to meta-data assigned in a 64-bit unsigned integer by
+ * the sender when the data was sent.
+ *
+ * @param timeout is a pointer to a timeout structure. If NULL, then wait forever
+ * with no timeout. If not NULL, then wait for the specified amount of time and
+ * return DRAGON_TIMEOUT if not successful. If 0,0 is provided, then that indicates
+ * that a try-once attempt is to be made.
+ *
+ * @return DRAGON_SUCCESS, DRAGON_EOT or a return code to indicate what problem
+ * occurred. When DRAGON_EOT is returned there may also be bytes that were read
+ * or there may be zero bytes read.
+ **/
+dragonError_t
+dragon_fli_recv_bytes(dragonFLIRecvHandleDescr_t* recv_handle, size_t requested_size,
+                      size_t* received_size, uint8_t** bytes, uint64_t* arg,
+                      const timespec_t* timeout);
+
+/**
+ * @brief Receive data from the FLI adapter.
+ *
+ * All receive operations between an open and a close of a recv handle are
+ * guaranteed to be received by one receiver in the order they were sent. If
+ * the return code comes back with DRAGON_EOT, then there is no more
+ * data to be received. When DRAGON_EOT is returned there may be valid data
+ * with it. The received_size will always indicate the amount of valid data returned.
+ *
+ * @param recv_handle is an open receive handle.
+ *
+ * @param requested_size is the maximum number of bytes to receive. Fewer bytes
+ * may be received; received_size provides the actual number of bytes read. If
+ * requested_size==0 then all available bytes are read.
+ *
+ * @param received_size is a pointer to a variable that will be initialized with the
+ * number of received bytes.
+ *
+ * @param bytes is a pointer that points to space at least of requested_size. This
+ * is provided by the caller of this function and will be filled in with
+ * received_size bytes upon successful completion of this call.
+ *
+ * @param arg is a pointer to meta-data assigned in a 64-bit unsigned integer by
+ * the sender when the data was sent.
+ *
+ * @param timeout is a pointer to a timeout structure. If NULL, then wait forever
+ * with no timeout. If not NULL, then wait for the specified amount of time and
+ * return DRAGON_TIMEOUT if not successful. If 0,0 is provided, then that indicates
+ * that a try-once attempt is to be made.
+ *
+ * @return DRAGON_SUCCESS, DRAGON_EOT or a return code to indicate what problem
+ * occurred. When DRAGON_EOT is returned there may also be bytes that were read
+ * or there may be zero bytes read.
+ **/
+dragonError_t
+dragon_fli_recv_bytes_into(dragonFLIRecvHandleDescr_t* recv_handle, size_t requested_size,
+                           size_t* received_size, uint8_t* bytes, uint64_t* arg,
+                           const timespec_t* timeout);
+
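+/*
+ * An illustrative sketch of receiving into caller-owned space (assumes an
+ * open receive handle; error handling elided). Unlike dragon_fli_recv_bytes,
+ * nothing is allocated on the caller's behalf:
+ *
+ *     uint8_t buf[4096];
+ *     size_t got;
+ *     uint64_t arg;
+ *     dragon_fli_recv_bytes_into(&recvh, sizeof(buf), &got, buf, &arg, NULL);
+ *     // On success, the first got bytes of buf are valid (got <= sizeof(buf)).
+ */
+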
+/**
+ * @brief Receive a Memory Descriptor from the FLI adapter.
+ *
+ * All receive operations between an open and a close of a recv handle are
+ * guaranteed to be received by one receiver in the order they were sent. This
+ * operation is a lower-level receive operation that returns the memory descriptor
+ * that was read from the channel.
+ *
+ * @param recv_handle is an open receive handle.
+ *
+ * @param mem is a memory descriptor that will be initialized (upon DRAGON_SUCCESS
+ * completion) with the shared memory where the message is located.
+ *
+ * @param arg is a pointer to meta-data assigned in a 64-bit unsigned integer by
+ * the sender when the data was sent.
+ *
+ * @param timeout is a pointer to a timeout structure. If NULL, then wait forever
+ * with no timeout. If not NULL, then wait for the specified amount of time and
+ * return DRAGON_TIMEOUT if not successful. If 0,0 is provided, then that indicates
+ * that a try-once attempt is to be made.
+ *
+ * @return DRAGON_SUCCESS, DRAGON_EOT or a return code to indicate what problem occurred.
+ **/
+dragonError_t
+dragon_fli_recv_mem(dragonFLIRecvHandleDescr_t* recv_handle, dragonMemoryDescr_t* mem,
+                    uint64_t* arg, const timespec_t* timeout);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
\ No newline at end of file
diff --git a/src/include/dragon/global_types.h b/src/include/dragon/global_types.h
index 1e25e6b..2afcf67 100644
--- a/src/include/dragon/global_types.h
+++ b/src/include/dragon/global_types.h
@@ -22,6 +22,12 @@ typedef uint8_t dragonUUID[16];
 typedef struct timespec timespec_t;
 typedef struct timeval timeval_t;
 
+/* a few global constants */
+/* The following strings must be identical in facts.py */
+#define DRAGON_NUM_GW_ENV_VAR "DRAGON_NUM_GW_CHANNELS_PER_NODE"
+#define DRAGON_DEFAULT_PD_VAR "DRAGON_DEFAULT_PD"
+#define DRAGON_INF_PD_VAR "DRAGON_INF_PD"
+
 /**
  * @brief Wait Mode constants
  *
@@ -74,6 +80,20 @@ typedef enum dragonChannelSendReturnWhen_st {
     DRAGON_CHANNEL_SEND_RETURN_WHEN_NONE
 } dragonChannelSendReturnWhen_t;
 
+/**
+ * @brief Constants indicating a type of channel operation.
+ *
+ * Constants for send_msg, get_msg and poll channels operations. Currently,
+ * these are used to help select a gateway index for a channel operation,
+ * with the constant's value specifying an offset into a gateway group.
+ **/
+
+typedef enum dragonChannelOpType_st {
+    DRAGON_OP_TYPE_SEND_MSG = 0,
+    DRAGON_OP_TYPE_GET_MSG,
+    DRAGON_OP_TYPE_POLL
+} dragonChannelOpType_t;
+
 /**
  * @brief This is the type of the release function for dragon waiting.
* diff --git a/src/include/dragon/managed_memory.h b/src/include/dragon/managed_memory.h index a01746a..055122e 100644 --- a/src/include/dragon/managed_memory.h +++ b/src/include/dragon/managed_memory.h @@ -248,6 +248,9 @@ dragon_memory_pool_attach(dragonMemoryPoolDescr_t * pool_descr, const dragonMemo dragonError_t dragon_memory_pool_attach_from_env(dragonMemoryPoolDescr_t * pool_descr, const char * env_var); +dragonError_t +dragon_memory_pool_attach_default(dragonMemoryPoolDescr_t* pool); + dragonError_t dragon_memory_pool_detach(dragonMemoryPoolDescr_t * pool_descr); @@ -285,6 +288,14 @@ dragonError_t dragon_memory_pool_get_type_allocations(const dragonMemoryPoolDescr_t * pool_descr, const dragonMemoryAllocationType_t type, dragonMemoryPoolAllocations_t * allocs); + + +dragonError_t +dragon_memory_pool_get_pointer(const dragonMemoryPoolDescr_t * pool_descr, void **base_ptr); + +dragonError_t +dragon_memory_pool_get_size(const dragonMemoryPoolDescr_t * pool_descr, size_t *size); + dragonError_t dragon_memory_alloc(dragonMemoryDescr_t * mem_descr, const dragonMemoryPoolDescr_t * pool_descr, const size_t bytes); diff --git a/src/include/dragon/return_codes.h b/src/include/dragon/return_codes.h index 690f9ab..be47f38 100644 --- a/src/include/dragon/return_codes.h +++ b/src/include/dragon/return_codes.h @@ -20,6 +20,7 @@ typedef enum dragonError_st { DRAGON_NOT_FOUND, DRAGON_INVALID_LOCK_KIND, DRAGON_INVALID_SYNC_KIND, + DRAGON_EOT, DRAGON_CHANNEL_MEMORY_POOL_NONLOCAL, DRAGON_CHANNEL_ALREADY_DESTROYED, DRAGON_CHANNEL_BUFFER_ERROR, @@ -107,6 +108,7 @@ typedef enum dragonError_st { DRAGON_BARRIER_BROKEN, DRAGON_BARRIER_WAIT_TRY_AGAIN, DRAGON_BARRIER_READY_TO_RELEASE, + DRAGON_OBJECT_DESTROYED, DRAGON_BAD_RETURN_CODE // This must remain the last return code so the dragon_get_rc_string works correctly. 
} dragonError_t; diff --git a/src/include/dragon/utils.h b/src/include/dragon/utils.h index fbe351d..31197e1 100644 --- a/src/include/dragon/utils.h +++ b/src/include/dragon/utils.h @@ -39,9 +39,6 @@ dragon_get_pid_from_uuid(dragonUUID uuid); uint32_t dragon_get_ctr_from_uuid(dragonUUID uuid); -dragonError_t -dragon_timespec_deadline(const timespec_t* timer, timespec_t* deadline); - dragonError_t dragon_timespec_add(timespec_t* result, const timespec_t* first, const timespec_t* second); @@ -52,7 +49,10 @@ bool dragon_timespec_le(const timespec_t* first, const timespec_t* second); dragonError_t -dragon_timespec_remaining(const timespec_t * end_time, timespec_t * remaining_timeout); +dragon_timespec_deadline(const timespec_t* timer, timespec_t* deadline); + +dragonError_t +dragon_timespec_remaining(const timespec_t * deadline, timespec_t * remaining_timeout); char* dragon_base64_encode(uint8_t *data, size_t input_length, size_t *output_length); diff --git a/src/lib/_channels.h b/src/lib/_channels.h index c7488be..bd43fcc 100644 --- a/src/lib/_channels.h +++ b/src/lib/_channels.h @@ -32,8 +32,6 @@ extern "C" { #define DRAGON_CHANNEL_NUM_POLL_BCASTS 5 #define DRAGON_CHANNEL_DEFAULT_MAX_EVENT_BCASTS 8 #define DRAGON_CHANNEL_DEFAULT_MAX_GW_ENV_NAME_LENGTH 200 -/* This following string must be identical in dfacts.py */ -#define DRAGON_NUM_GW_ENV_VAR "DRAGON_NUM_GW_CHANNELS_PER_NODE" /* attributes and header info embedded into a Channel */ /* NOTE: This must match the pointers assigned diff --git a/src/lib/_fli.h b/src/lib/_fli.h new file mode 100644 index 0000000..8cdbd6d --- /dev/null +++ b/src/lib/_fli.h @@ -0,0 +1,95 @@ +#ifndef HAVE_DRAGON_FLI_INTERNAL_H +#define HAVE_DRAGON_FLI_INTERNAL_H + +#include + +#include +#include + +#define DRAGON_FLI_UMAP_SEED 1605 +#define FLI_HAS_MAIN_CHANNEL 1 +#define FLI_HAS_MANAGER_CHANNEL 2 +#define FLI_USING_BUFFERED_PROTOCOL 4 +#define FLI_EOT 0xFFFFFFFFFFFFFFFF + +#ifdef __cplusplus +extern "C" { +#endif + +/* seated fli structure */ +typedef struct dragonFLI_st { + dragonChannelDescr_t main_ch; + dragonChannelDescr_t mgr_ch; + dragonMemoryPoolDescr_t pool; + dragonULInt num_strm_chs; + dragonFLIAttr_t attrs; + bool has_main_ch; /* true if main_ch is initialized and used. */ + bool has_mgr_ch; /* true if mgr_ch is initialized and used. */ + bool use_buffered_protocol; /* true if not using stream channels */ +} dragonFLI_t; + +/* buffered allocation used for the buffered protocol on these + adapters. */ +typedef struct dragonFLISendBufAlloc_st { + uint8_t* data; + size_t num_bytes; + size_t offset; /* used only on received buffered bytes */ + uint64_t arg; /* used only on received buffered bytes */ + struct dragonFLISendBufAlloc_st* next; +} dragonFLISendBufAlloc_t; + +/* buffered allocation used for buffered received bytes. */ +typedef struct dragonFLIRecvBufAlloc_st { + dragonMemoryDescr_t mem; + size_t num_bytes; + size_t offset; /* used only on received buffered bytes */ + uint64_t arg; /* used only on received buffered bytes */ + struct dragonFLIRecvBufAlloc_st* next; +} dragonFLIRecvBufAlloc_t; + +/** + * @brief An FLI Send Handle + * + * When an adapter is open for sending, a send handle is provided which + * is initialized and used until closed. 
+ * +*/ +typedef struct dragonFLISendHandle_st { + dragonFLI_t* adapter; + dragonChannelDescr_t strm_channel; + dragonChannelSendh_t chan_sendh; + dragonFLISendBufAlloc_t* buffered_allocations; + uint64_t buffered_arg; + size_t total_bytes; + bool user_supplied; + pthread_t tid; /* used to keep track of send or receive file descriptors */ + int pipe[2]; +} dragonFLISendHandle_t; + +/** + * @brief An FLI Receive Handle + * + * When an adapter is open for receiving, a recv handle is provided which + * is initialized and used until closed. + * +*/ +typedef struct dragonFLIRecvHandle_st { + dragonFLI_t* adapter; + dragonChannelDescr_t strm_channel; + dragonChannelRecvh_t chan_recvh; + bool user_supplied; + bool stream_received; + bool EOT_received; + size_t num_bytes_received; + size_t buffered_bytes; + dragonFLIRecvBufAlloc_t* buffered_data; + dragonFLIRecvBufAlloc_t* tail; + pthread_t tid; /* used to keep track of send or receive file descriptors */ + int pipe[2]; +} dragonFLIRecvHandle_t; + +#ifdef __cplusplus +} +#endif + +#endif \ No newline at end of file diff --git a/src/lib/bcast.c b/src/lib/bcast.c index c71d4b6..3d0870c 100644 --- a/src/lib/bcast.c +++ b/src/lib/bcast.c @@ -132,7 +132,7 @@ _bcast_init_obj(void* obj_ptr, size_t alloc_sz, size_t max_payload_sz, size_t ma size_t diff = (((void*)header->payload_area) + max_payload_sz) - obj_ptr; if (diff > alloc_sz) { char err_str[300]; - sprintf((char*)&err_str, "The provided size was %lu bytes and the required size was %lu bytes.\nThere is not enough room to allocate the requested bcast object.", alloc_sz, diff); + snprintf(err_str, 299, "The provided size was %lu bytes and the required size was %lu bytes.\nThere is not enough room to allocate the requested bcast object.", alloc_sz, diff); err_return(DRAGON_INVALID_ARGUMENT, err_str); } @@ -1487,6 +1487,8 @@ dragon_bcast_notify_signal(dragonBCastDescr_t* bd, const dragonWaitMode_t wait_m /* signal notification is requested. 
*/ dragonBCastSignalArg_t* arg = malloc(sizeof(dragonBCastSignalArg_t)); + if (arg == NULL) + err_return (DRAGON_INTERNAL_MALLOC_FAIL, "Could not allocate space for malloc'ed thread argument."); if (payload_sz == NULL) err_return(DRAGON_INVALID_ARGUMENT, "The BCast notify payload_sz argument cannot be NULL when signal notification is requested."); @@ -1646,6 +1648,9 @@ dragon_bcast_trigger_one(dragonBCastDescr_t* bd, const timespec_t* timer, const dragonError_t err; err = dragon_bcast_trigger_some(bd, 1, timer, payload, payload_sz); + if (err == DRAGON_BCAST_NO_WAITERS) + no_err_return(DRAGON_BCAST_NO_WAITERS); + if (err != DRAGON_SUCCESS) append_err_return(err, "Call to trigger some of 1 did not succeed."); @@ -1861,8 +1866,14 @@ dragon_bcast_trigger_all(dragonBCastDescr_t* bd, const timespec_t* timer, const err = dragon_bcast_trigger_some(bd, INT_MAX, timer, payload, payload_sz); - if (err != DRAGON_SUCCESS) - err_return(err, "Call to trigger some with INT_MAX failed."); + if (err == DRAGON_BCAST_NO_WAITERS) + no_err_return(DRAGON_BCAST_NO_WAITERS); + + if (err != DRAGON_SUCCESS) { + char err_str[200]; + snprintf(err_str, 199, "Call to trigger some with INT_MAX failed with %s.", dragon_get_rc_string(err)); + err_return(err, err_str); + } no_err_return(DRAGON_SUCCESS); } diff --git a/src/lib/channels.c b/src/lib/channels.c index 46eae46..d006673 100644 --- a/src/lib/channels.c +++ b/src/lib/channels.c @@ -16,6 +16,8 @@ static dragonList_t* dg_gateways = NULL; /* used to verify header assignment in channels */ static bool _header_checked = false; +static const int dg_num_gateway_types = 3; + /* below are macros that cover locking and unlocking the UT and OT locks with * error handling */ #define _obtain_ut_lock(channel) \ @@ -1956,7 +1958,7 @@ _channel_is_masquerading(const dragonChannelDescr_t* ch) } static dragonError_t -_get_gw_idx(const dragonChannelDescr_t *ch, int *gw_idx) +_get_gw_idx(const dragonChannelDescr_t *ch, dragonChannelOpType_t op_type, int *gw_idx) { dragonULInt target_hostid; @@ -1964,7 +1966,34 @@ _get_gw_idx(const dragonChannelDescr_t *ch, int *gw_idx) if (err != DRAGON_SUCCESS) append_err_return(err, "Failed to obtain hostid for target channel."); - *gw_idx = dragon_hash_ulint(target_hostid) % dg_num_gateways; + /* For tcp agents, there is always 1 gateway and the index is hence always 0. + * For hsta agents, there will be a multiple of dg_num_gateway_types gateways, + * and a group of dg_num_gateway_types for each nic in the multi-nic case. 
+     *
+     * Example with 4 nics
+     * -------------------
+     *
+     * send_msg get_msg poll  send_msg get_msg poll  send_msg get_msg poll  send_msg get_msg poll
+     * \______/ \_____/ \__/  \______/ \_____/ \__/  \______/ \_____/ \__/  \______/ \_____/ \__/
+     *   gw 0    gw 1   gw 2    gw 3    gw 4   gw 5    gw 6    gw 7   gw 8    gw 9   gw 10  gw 11
+     * \____________________/  \____________________/  \____________________/  \____________________/
+     *          nic 0                   nic 1                   nic 2                   nic 3
+     */
+    if (dg_num_gateways == 1) {
+        *gw_idx = 0;
+    } else {
+        int num_gw_groups = dg_num_gateways / dg_num_gateway_types;
+        int my_gw_group = dragon_hash_ulint(target_hostid) % num_gw_groups;
+        *gw_idx = (dg_num_gateway_types * my_gw_group) + op_type;
+    }
+
+    if (*gw_idx < 0 || dg_num_gateways <= *gw_idx) {
+        char err_str[100];
+        snprintf(err_str, 99,
+                 "Invalid gateway index: gateway idx=%d, num gateways=%d.",
+                 *gw_idx, dg_num_gateways);
+        err_return(DRAGON_INVALID_ARGUMENT, err_str);
+    }
 
     no_err_return(DRAGON_SUCCESS);
 }
@@ -2165,7 +2194,7 @@ dragon_channel_create(dragonChannelDescr_t* ch, const dragonC_UID_t c_uid,
 
     /* the memory pool must be locally addressable */
     if (!dragon_memory_pool_is_local(pool_descr))
-        append_err_return(DRAGON_INVALID_ARGUMENT, "cannot directly access memory pool for channel creation");
+        err_return(DRAGON_INVALID_ARGUMENT, "cannot directly access memory pool for channel creation");
 
     /* if the attrs are NULL populate a default one */
     dragonChannelAttr_t def_attr;
@@ -2173,7 +2202,7 @@
         err = dragon_channel_attr_init(&def_attr);
 
         if (err != DRAGON_SUCCESS)
-            append_err_return(err, "Could not intialize channel attributes.");
+            append_err_return(err, "Could not initialize channel attributes.");
 
         attr = &def_attr;
     } else {
@@ -2396,7 +2425,7 @@ dragon_channel_serialize(const dragonChannelDescr_t* ch, dragonChannelSerial_t*
 /**
  * @brief Free the internal resources of a serialized channel descriptor
  *
- * This frees internal structures of a serialized memory descriptor. It does not
+ * This frees internal structures of a serialized channel descriptor. It does not
  * destroy the channel itself.
  *
 * @param ch_ser is a serialized channel descriptor.
@@ -2652,6 +2681,32 @@ dragon_channel_detach(dragonChannelDescr_t* ch)
     no_err_return(DRAGON_SUCCESS);
 }
 
+/**
+ * @brief Clone a channel descriptor
+ *
+ * Calling this will copy a channel descriptor from one location to another. This does not
+ * copy the channel. It is used only for making a copy of a channel descriptor inside a process.
+ *
+ * @param newch_descr is the channel descriptor space to copy into.
+ *
+ * @param oldch_descr is the existing descriptor to clone.
+ *
+ * @return DRAGON_SUCCESS or a return code to indicate what problem occurred.
+ */
+dragonError_t
+dragon_channel_descr_clone(dragonChannelDescr_t * newch_descr, const dragonChannelDescr_t * oldch_descr)
+{
+    if (oldch_descr == NULL)
+        err_return(DRAGON_INVALID_ARGUMENT, "Cannot clone from NULL descriptor.");
+
+    if (newch_descr == NULL)
+        err_return(DRAGON_INVALID_ARGUMENT, "Cannot clone to NULL descriptor.");
+
+    *newch_descr = *oldch_descr;
+
+    no_err_return(DRAGON_SUCCESS);
+}
+
 /** @} */ // end of group.
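/*
 * Worked example of the gateway selection above (illustrative): with 4 nics
 * and dg_num_gateway_types == 3, dg_num_gateways == 12 and num_gw_groups == 4.
 * A target hostid whose hash maps to group 2 selects, for a poll operation:
 *
 *     gw_idx = 3 * 2 + DRAGON_OP_TYPE_POLL;  // 6 + 2 == 8
 *
 * which is the "poll" gateway of nic 2 in the diagram above.
 */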
/** @defgroup channels_functionality Channels Functions @@ -2872,7 +2927,7 @@ dragon_channel_sendh(const dragonChannelDescr_t* ch, dragonChannelSendh_t* ch_sh int gw_idx; - err = _get_gw_idx(&ch_sh->_ch, &gw_idx); + err = _get_gw_idx(&ch_sh->_ch, DRAGON_OP_TYPE_SEND_MSG, &gw_idx); if (err != DRAGON_SUCCESS) append_err_return(err, "Could not get a gateway index."); @@ -3049,7 +3104,7 @@ dragon_channel_recvh(const dragonChannelDescr_t* ch, dragonChannelRecvh_t* ch_rh int gw_idx; - err = _get_gw_idx(&ch_rh->_ch, &gw_idx); + err = _get_gw_idx(&ch_rh->_ch, DRAGON_OP_TYPE_GET_MSG, &gw_idx); if (err != DRAGON_SUCCESS) append_err_return(err, "Could not get a gateway index."); @@ -4052,7 +4107,7 @@ dragon_channel_poll(const dragonChannelDescr_t* ch, dragonWaitMode_t wait_mode, dragonMemoryDescr_t req_mem; int gw_idx; - err = _get_gw_idx(ch, &gw_idx); + err = _get_gw_idx(ch, DRAGON_OP_TYPE_POLL, &gw_idx); if (err != DRAGON_SUCCESS) append_err_return(err, "Could not get a gateway index."); @@ -4378,6 +4433,8 @@ dragon_channel_register_gateways_from_env() append_err_return(DRAGON_INVALID_ARGUMENT, err_str); } + dragon_channel_serial_free(&gw_ser); + err = dragon_channel_register_gateway(&gw_ch); if (err != DRAGON_SUCCESS) { snprintf(err_str, 400, diff --git a/src/lib/channels_messages.c b/src/lib/channels_messages.c index 005db0d..2b782b3 100644 --- a/src/lib/channels_messages.c +++ b/src/lib/channels_messages.c @@ -7,7 +7,7 @@ #define DRAGON_CHANNEL_GWHEADER_NULINTS ((sizeof(dragonGatewayMessageHeader_t)/sizeof(dragonULInt*))-1) //#define DRAGON_CHANNEL_EXTRA_CHECKS -static timespec_t TRANSPORT_PATIENCE_ON_CLIENT_COMPLETE = {0,100000}; +static timespec_t TRANSPORT_PATIENCE_ON_CLIENT_COMPLETE = {0,1000000}; static dragonError_t _gateway_message_bcast_size(size_t payload_sz, size_t * bcast_nbytes) diff --git a/src/lib/err.h b/src/lib/err.h index 3b443b5..25d6d2e 100644 --- a/src/lib/err.h +++ b/src/lib/err.h @@ -20,6 +20,7 @@ char * dragon_getlasterrstr(); /* This can be modified during debugging if desired */ #define no_err_return(err) ({\ + _set_errstr(NULL);\ return err;\ }) diff --git a/src/lib/fli.c b/src/lib/fli.c new file mode 100644 index 0000000..99b03c9 --- /dev/null +++ b/src/lib/fli.c @@ -0,0 +1,1831 @@ +#include +#include "_fli.h" +#include "err.h" +#include "umap.h" +#include +#include +#include +#include +#include +#include +#include +#include + +static dragonMap_t* dg_fli_adapters = NULL; +static dragonMap_t* dg_fli_send_handles = NULL; +static dragonMap_t* dg_fli_recv_handles = NULL; + +#define DEFAULT_CHUNK_SIZE 1024 + +/* Used in the File Descriptor Sender Thread */ +typedef struct _SenderArg_st { + dragonFLISendHandleDescr_t* sendh; + int fd; + uint64_t user_arg; + size_t chunk_size; + bool buffer; +} _SenderArg_t; + +/* Used in the File Descriptor Receiver Thread */ +typedef struct _ReceiverArg_st { + dragonFLIRecvHandleDescr_t* recvh; + int fd; +} _ReceiverArg_t; + +/* obtain an fli structure from a given adapter descriptor */ +static dragonError_t +_fli_from_descr(const dragonFLIDescr_t* adapter, dragonFLI_t** fli) +{ + if (adapter == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "Invalid fli adapter descriptor"); + + /* find the entry in our pool map for this descriptor */ + dragonError_t err = dragon_umap_getitem(dg_fli_adapters, adapter->_idx, (void*)fli); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Failed to find item in fli adapters map"); + + no_err_return(DRAGON_SUCCESS); +} + +/* obtain an fli structure from a given send handle descriptor */ +static dragonError_t 
+_fli_sendh_from_descr(const dragonFLISendHandleDescr_t* send_descr, dragonFLISendHandle_t** send_handle) +{ + if (send_descr == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "Invalid fli send handle descriptor"); + + /* find the entry in our pool map for this descriptor */ + dragonError_t err = dragon_umap_getitem(dg_fli_send_handles, send_descr->_idx, (void*)send_handle); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Failed to find item in fli send handles map"); + + no_err_return(DRAGON_SUCCESS); +} + +/* obtain an fli structure from a given recv handle descriptor */ +static dragonError_t +_fli_recvh_from_descr(const dragonFLIRecvHandleDescr_t* recv_descr, dragonFLIRecvHandle_t** recv_handle) +{ + if (recv_descr == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "Invalid fli recv handle descriptor"); + + /* find the entry in our pool map for this descriptor */ + dragonError_t err = dragon_umap_getitem(dg_fli_recv_handles, recv_descr->_idx, (void*)recv_handle); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Failed to find item in fli recv handles map"); + + no_err_return(DRAGON_SUCCESS); +} + +/* insert an fli structure into the unordered map using the adapter->_idx as the key */ +static dragonError_t +_add_umap_fli_entry(dragonFLIDescr_t* adapter, const dragonFLI_t* fli) +{ + dragonError_t err; + + /* register this channel in our umap */ + if (dg_fli_adapters == NULL) { + /* this is a process-global variable and has no specific call to be + * destroyed */ + dg_fli_adapters = malloc(sizeof(dragonMap_t)); + if (dg_fli_adapters == NULL) + err_return(DRAGON_INTERNAL_MALLOC_FAIL, "Cannot allocate umap for fli adapters"); + + err = dragon_umap_create(dg_fli_adapters, DRAGON_FLI_UMAP_SEED); + if (err != DRAGON_SUCCESS) + append_err_return(err, "failed to create umap for fli adapters"); + } + + err = dragon_umap_additem_genkey(dg_fli_adapters, (void*)fli, &adapter->_idx); + if (err != DRAGON_SUCCESS) + append_err_return(err, "failed to insert item into fli adapters umap"); + + no_err_return(DRAGON_SUCCESS); +} + +/* insert an fli send handle structure into the unordered map using the send_descr->_idx as the key */ +static dragonError_t +_add_umap_fli_sendh_entry(dragonFLISendHandleDescr_t* send_descr, const dragonFLISendHandle_t* send_handle) +{ + dragonError_t err; + + /* register this channel in our umap */ + if (dg_fli_send_handles == NULL) { + /* this is a process-global variable and has no specific call to be + * destroyed */ + dg_fli_send_handles = malloc(sizeof(dragonMap_t)); + if (dg_fli_send_handles == NULL) + err_return(DRAGON_INTERNAL_MALLOC_FAIL, "Cannot allocate umap for fli send handles"); + + err = dragon_umap_create(dg_fli_send_handles, DRAGON_FLI_UMAP_SEED); + if (err != DRAGON_SUCCESS) + append_err_return(err, "failed to create umap for fli send handles"); + } + + err = dragon_umap_additem_genkey(dg_fli_send_handles, (void*)send_handle, &send_descr->_idx); + if (err != DRAGON_SUCCESS) + append_err_return(err, "failed to insert item into fli send handles umap"); + + no_err_return(DRAGON_SUCCESS); +} + +/* insert an fli recv handle structure into the unordered map using the recv_descr->_idx as the key */ +static dragonError_t +_add_umap_fli_recvh_entry(dragonFLIRecvHandleDescr_t* recv_descr, const dragonFLIRecvHandle_t* recv_handle) +{ + dragonError_t err; + + /* register this channel in our umap */ + if (dg_fli_recv_handles == NULL) { + /* this is a process-global variable and has no specific call to be + * destroyed */ + dg_fli_recv_handles = 
malloc(sizeof(dragonMap_t)); + if (dg_fli_recv_handles == NULL) + err_return(DRAGON_INTERNAL_MALLOC_FAIL, "Cannot allocate umap for fli recv handles"); + + err = dragon_umap_create(dg_fli_recv_handles, DRAGON_FLI_UMAP_SEED); + if (err != DRAGON_SUCCESS) + append_err_return(err, "failed to create umap for fli recv handles"); + } + + err = dragon_umap_additem_genkey(dg_fli_recv_handles, (void*)recv_handle, &recv_descr->_idx); + if (err != DRAGON_SUCCESS) + append_err_return(err, "failed to insert item into fli recv handles umap"); + + no_err_return(DRAGON_SUCCESS); +} + + +static dragonError_t +_validate_attr(const dragonFLIAttr_t* attr) +{ + return DRAGON_NOT_IMPLEMENTED; +} + +static dragonError_t +_send_mem(dragonChannelSendh_t* sendh, dragonMemoryDescr_t* mem, uint64_t arg, timespec_t* deadline) +{ + dragonError_t err; + timespec_t remaining_time; + timespec_t* timeout = NULL; + dragonMessage_t msg; + dragonMessageAttr_t msg_attrs; + + if (sendh == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "You must provide a channel send handle to send a message."); + + if (mem == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "You must provide a memory descriptor."); + + if (deadline != NULL) { + timeout = &remaining_time; + err = dragon_timespec_remaining(deadline, timeout); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not compute remaining time"); + } + + err = dragon_channel_message_attr_init(&msg_attrs); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Failed to init message attr structure."); + + msg_attrs.hints = arg; + + err = dragon_channel_message_init(&msg, mem, &msg_attrs); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not initialize serialized stream channel message."); + + err = dragon_chsend_send_msg(sendh, &msg, DRAGON_CHANNEL_SEND_TRANSFER_OWNERSHIP, timeout); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not add serialized stream channel to manager channel."); + + err = dragon_channel_message_destroy(&msg, false); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not destroy message."); + + err = dragon_channel_message_attr_destroy(&msg_attrs); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not destroy message attributes."); + + no_err_return(DRAGON_SUCCESS); +} + +static dragonError_t +_send_bytes(dragonChannelSendh_t* chan_sendh, dragonMemoryPoolDescr_t* pool, uint8_t* bytes, size_t num_bytes, uint64_t arg, timespec_t* deadline) +{ + dragonError_t err; + dragonMemoryDescr_t mem_descr; + void* mem_ptr; + + if (pool == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "Cannot send bytes without a pool for allocations."); + + if (bytes == NULL && num_bytes != 0) + err_return(DRAGON_INVALID_ARGUMENT, "You must provide bytes when sending a non-zero number of bytes."); + + err = dragon_memory_alloc(&mem_descr, pool, num_bytes); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not get shared memory for message data."); + + if (num_bytes > 0) { + err = dragon_memory_get_pointer(&mem_descr, &mem_ptr); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not get pointer for shared memory."); + memcpy(mem_ptr, bytes, num_bytes); + } + + err = _send_mem(chan_sendh, &mem_descr, arg, deadline); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Error when calling internal _send_mem."); + + no_err_return(DRAGON_SUCCESS); +} + +static dragonError_t +_send_buffered_bytes(dragonFLISendHandle_t* sendh, timespec_t* deadline) +{ + dragonError_t err; + dragonMemoryDescr_t mem_descr; + void* mem_ptr; + void* 
dest_ptr;
+    dragonFLISendBufAlloc_t* node;
+    dragonFLISendBufAlloc_t* prev;
+
+    err = dragon_memory_alloc(&mem_descr, &sendh->adapter->pool, sendh->total_bytes);
+    if (err != DRAGON_SUCCESS)
+        append_err_return(err, "Could not get shared memory for message data.");
+
+    err = dragon_memory_get_pointer(&mem_descr, &mem_ptr);
+    if (err != DRAGON_SUCCESS)
+        append_err_return(err, "Could not get pointer for shared memory.");
+
+    dest_ptr = mem_ptr + sendh->total_bytes;
+    node = sendh->buffered_allocations;
+
+    while (node != NULL) {
+        dest_ptr = dest_ptr - node->num_bytes;
+        memcpy(dest_ptr, node->data, node->num_bytes);
+        prev = node;
+        node = node->next;
+        free(prev->data);
+        free(prev);
+    }
+
+    if (dest_ptr != mem_ptr)
+        err_return(DRAGON_INVALID_OPERATION, "There was an error while unbuffering data in send operation.");
+
+    err = _send_mem(&sendh->chan_sendh, &mem_descr, sendh->buffered_arg, deadline);
+    if (err != DRAGON_SUCCESS)
+        append_err_return(err, "Error when calling internal _send_mem.");
+
+    sendh->buffered_allocations = NULL;
+    sendh->total_bytes = 0;
+    sendh->buffered_arg = 0;
+
+    no_err_return(DRAGON_SUCCESS);
+}
+
+static dragonError_t
+_buffer_bytes(dragonFLISendHandle_t* sendh, uint8_t* bytes, size_t num_bytes, uint64_t arg) {
+    void* data_ptr;
+    dragonFLISendBufAlloc_t* node_ptr;
+
+    if (sendh->buffered_allocations == NULL)
+        /* first write, so grab the user's meta data arg */
+        sendh->buffered_arg = arg;
+
+    if (num_bytes > 0) {
+        data_ptr = malloc(num_bytes);
+        if (data_ptr == NULL)
+            err_return(DRAGON_INTERNAL_MALLOC_FAIL, "Could not allocate space to buffer data - out of memory.");
+
+        node_ptr = malloc(sizeof(dragonFLISendBufAlloc_t));
+        if (node_ptr == NULL)
+            err_return(DRAGON_INTERNAL_MALLOC_FAIL, "Could not allocate space for buffering data - out of memory.");
+
+        memcpy(data_ptr, bytes, num_bytes);
+
+        node_ptr->data = data_ptr;
+        node_ptr->num_bytes = num_bytes;
+        sendh->total_bytes += num_bytes;
+        node_ptr->next = sendh->buffered_allocations;
+        sendh->buffered_allocations = node_ptr;
+    }
+
+    no_err_return(DRAGON_SUCCESS);
+}
+
+static dragonError_t
+_recv_mem(dragonChannelRecvh_t* recvh, dragonMemoryDescr_t* mem, uint64_t* arg, timespec_t* deadline)
+{
+    dragonError_t err;
+    dragonMessage_t msg;
+    dragonMessageAttr_t attrs;
+
+    if (recvh == NULL)
+        err_return(DRAGON_INVALID_ARGUMENT, "Must provide non-null receive handle.");
+
+    if (mem == NULL)
+        err_return(DRAGON_INVALID_ARGUMENT, "Must provide non-null memory descriptor");
+
+    if (arg == NULL)
+        err_return(DRAGON_INVALID_ARGUMENT, "Must provide a non-null arg variable pointer.");
+
+    err = dragon_channel_message_init(&msg, NULL, NULL);
+    if (err != DRAGON_SUCCESS)
+        append_err_return(err, "Could not initialize message structure.");
+
+    err = dragon_chrecv_get_msg_blocking(recvh, &msg, deadline);
+    if (err != DRAGON_SUCCESS)
+        append_err_return(err, "Could not receive memory from channel.");
+
+    err = dragon_channel_message_getattr(&msg, &attrs);
+    if (err != DRAGON_SUCCESS)
+        append_err_return(err, "Could not get message attributes from received message.");
+
+    *arg = attrs.hints;
+
+    err = dragon_channel_message_get_mem(&msg, mem);
+    if (err != DRAGON_SUCCESS)
+        append_err_return(err, "Could not get memory for stream channel.");
+
+    err = dragon_channel_message_destroy(&msg, false);
+    if (err != DRAGON_SUCCESS)
+        append_err_return(err, "Could not destroy message structure.");
+
+    no_err_return(DRAGON_SUCCESS);
+}
+
+static dragonError_t
+_recv_bytes_into(dragonChannelRecvh_t* recvh, uint8_t** data, size_t*
num_bytes, uint64_t* arg, timespec_t* deadline) +{ + dragonError_t err; + dragonMemoryDescr_t mem; + void* mem_ptr; + + if (data == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "You must provide a non-null data pointer address"); + + if (num_bytes == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "You must provide a non-null size_t pointer"); + + err = _recv_mem(recvh, &mem, arg, deadline); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Failed to receive message in _recv_mem."); + + err = dragon_memory_get_size(&mem, num_bytes); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not get memory size for stream channel."); + + if (*num_bytes > 0) { + err = dragon_memory_get_pointer(&mem, &mem_ptr); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not get memory pointer for stream channel."); + + if (*data == NULL) { + /* If a NULL pointer is passed for the location of the data, then we initialize it to + point to a freshly malloced space. In this case we can make it for the required size + exactly. */ + *data = malloc(*num_bytes); + if (*data == NULL) + err_return(DRAGON_INTERNAL_MALLOC_FAIL, "Could not malloc memory for message."); + } + + memcpy(*data, mem_ptr, *num_bytes); + } else + *data = NULL; + + no_err_return(DRAGON_SUCCESS); +} + +static dragonError_t +_recv_bytes_buffered(dragonFLIRecvHandle_t* recvh, size_t requested_size, size_t* received_size, uint8_t** data, uint64_t* arg, timespec_t* deadline) +{ + dragonError_t err = DRAGON_SUCCESS; + void* src_ptr = NULL; + void* dest_ptr = NULL; + size_t cpy_bytes = 0; + dragonFLIRecvBufAlloc_t* node = NULL; + dragonFLIRecvBufAlloc_t* prev = NULL; + dragonMemoryDescr_t chunk_mem; + size_t chunk_size = 0; + uint64_t chunk_arg = 0; + size_t alloc_sz = 0; + size_t node_bytes = 0; + + /* Init return data to default values. */ + *received_size = 0; + *arg = 0; + + if (requested_size == 0 && recvh->buffered_bytes == 0) { + err = _recv_bytes_into(&recvh->chan_recvh, data, received_size, arg, deadline); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not receive bytes in helper routine."); + + no_err_return(DRAGON_SUCCESS); + } + + if (!recvh->EOT_received) { + while (chunk_arg != FLI_EOT && recvh->buffered_bytes < requested_size) { + err = _recv_mem(&recvh->chan_recvh, &chunk_mem, &chunk_arg, deadline); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not get data and buffer it in file-like adapter."); + + err = dragon_memory_get_size(&chunk_mem, &chunk_size); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not get chunk size while buffering data."); + + if (chunk_size > 0) { + node = malloc(sizeof(dragonFLIRecvBufAlloc_t)); + if (node == NULL) + err_return(DRAGON_INTERNAL_MALLOC_FAIL, "Could not allocate node for buffered data."); + err = dragon_memory_descr_clone(&node->mem, &chunk_mem, 0, NULL); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Unable to clone mem descriptor while buffering data."); + node->num_bytes = chunk_size; + node->offset = 0; + node->arg = chunk_arg; + node->next = NULL; + recvh->buffered_bytes += chunk_size; + recvh->tail->next = node; + recvh->tail = node; + } + else { + if (chunk_arg == FLI_EOT) + recvh->EOT_received = true; + + /* We have a zero sized memory descriptor to free. */ + err = dragon_memory_free(&chunk_mem); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not free zero sized memory chunk while buffering data."); + } + } + } + + /* We set the size to the minimum of what was asked for or what is available. 
*/
+    alloc_sz = requested_size;
+    if (recvh->buffered_bytes < requested_size)
+        alloc_sz = recvh->buffered_bytes;
+
+    /* Now check if there is any data left to return. */
+    if (alloc_sz == 0 && recvh->EOT_received) {
+        *arg = FLI_EOT;
+        no_err_return(DRAGON_SUCCESS);
+    }
+
+    if (alloc_sz == 0)
+        err_return(DRAGON_INVALID_OPERATION, "There was an internal failure while unbuffering received data.");
+
+    if (*data == NULL) {
+        /* If NULL is passed in, then we allocate the space here for the received data. Otherwise,
+           the space was provided. */
+        *data = malloc(alloc_sz);
+        if (*data == NULL) {
+            err_return(DRAGON_INTERNAL_MALLOC_FAIL, "Could not allocate space for received data.");
+        }
+    }
+
+    dest_ptr = *data;
+    prev = recvh->buffered_data;
+    node = prev->next;
+
+    while (*received_size < alloc_sz) {
+        cpy_bytes = alloc_sz - *received_size;
+        node_bytes = node->num_bytes - node->offset;
+        if (node_bytes < cpy_bytes)
+            cpy_bytes = node_bytes;
+
+        err = dragon_memory_get_pointer(&node->mem, &src_ptr);
+        if (err != DRAGON_SUCCESS)
+            append_err_return(err, "Could not get pointer to buffered managed memory.");
+        src_ptr += node->offset;
+
+        memcpy(dest_ptr, src_ptr, cpy_bytes);
+
+        node->offset = node->offset + cpy_bytes;
+        *received_size += cpy_bytes;
+        dest_ptr += cpy_bytes;
+        recvh->buffered_bytes -= cpy_bytes;
+        *arg = node->arg;
+
+        /* Check if we have used all data in this node */
+        if (node->num_bytes == node->offset) {
+            if (node == recvh->tail)
+                recvh->tail = recvh->buffered_data;
+            prev->next = node->next;
+            err = dragon_memory_free(&node->mem);
+            if (err != DRAGON_SUCCESS)
+                append_err_return(err, "Could not free buffered managed memory.");
+            free(node);
+            node = prev->next;
+        }
+    }
+
+    no_err_return(DRAGON_SUCCESS);
+}
+
+static dragonError_t
+_recv_bytes_common(dragonFLIRecvHandleDescr_t* recv_handle, size_t requested_size,
+                   size_t* received_size, uint8_t** bytes, uint64_t* arg, const timespec_t* timeout)
+{
+    dragonError_t err;
+    dragonFLIRecvHandle_t* recvh_obj;
+    timespec_t* deadline = NULL;
+    timespec_t end_time;
+
+    if (arg == NULL)
+        err_return(DRAGON_INVALID_ARGUMENT, "You must provide a pointer to a variable for the received arg metadata.");
+
+    if (timeout != NULL) {
+        deadline = &end_time;
+        err = dragon_timespec_deadline(timeout, deadline);
+        if (err != DRAGON_SUCCESS)
+            append_err_return(err, "Could not compute timeout deadline.");
+    }
+
+    err = _fli_recvh_from_descr(recv_handle, &recvh_obj);
+    if (err != DRAGON_SUCCESS)
+        append_err_return(err, "Could not resolve receive handle to internal fli receive handle object");
+
+    if (recvh_obj->stream_received)
+        /* data has been read already so we return our end of stream error code. */
+        err_return(DRAGON_EOT, "End of Stream. You must close and re-open receive handle.");
+
+    err = _recv_bytes_buffered(recvh_obj, requested_size, received_size, bytes, arg, deadline);
+    if (err != DRAGON_SUCCESS)
+        append_err_return(err, "Error occurred while receiving data.");
+
+    recvh_obj->num_bytes_received += *received_size;
+
+    if (*received_size == 0 && *arg == FLI_EOT) {
+        recvh_obj->stream_received = true;
+        *arg = 0; /* FLI_EOT is internal only so don't expose it. */
+        no_err_return(DRAGON_EOT);
+    }
+
+    if (recvh_obj->adapter->use_buffered_protocol)
+        /* When buffered, mark stream as received after first read.
*/
+        recvh_obj->stream_received = true;
+
+    no_err_return(DRAGON_SUCCESS);
+}
+
+static dragonError_t
+_send_stream_channel(const dragonChannelDescr_t* strm_ch, const dragonChannelDescr_t* to_chan, dragonMemoryPoolDescr_t* pool, timespec_t* deadline)
+{
+    dragonError_t err;
+    dragonChannelSerial_t ser;
+    dragonChannelSendh_t sendh;
+
+    if (pool == NULL)
+        err_return(DRAGON_INVALID_ARGUMENT, "The pool cannot be NULL.");
+
+    if (strm_ch == NULL)
+        err_return(DRAGON_INVALID_ARGUMENT, "The stream channel descriptor cannot be NULL.");
+
+    if (to_chan == NULL)
+        err_return(DRAGON_INVALID_ARGUMENT, "The channel to send to cannot be NULL.");
+
+    err = dragon_channel_sendh(to_chan, &sendh, NULL);
+    if (err != DRAGON_SUCCESS)
+        append_err_return(err, "Could not initialize send handle");
+
+    err = dragon_chsend_open(&sendh);
+    if (err != DRAGON_SUCCESS)
+        append_err_return(err, "Could not open send handle on channel.");
+
+    err = dragon_channel_serialize(strm_ch, &ser);
+    if (err != DRAGON_SUCCESS)
+        append_err_return(err, "Could not serialize stream channel.");
+
+    err = _send_bytes(&sendh, pool, ser.data, ser.len, 0, deadline);
+    if (err != DRAGON_SUCCESS)
+        append_err_return(err, "Could not send stream channel.");
+
+    err = dragon_channel_serial_free(&ser);
+    if (err != DRAGON_SUCCESS)
+        append_err_return(err, "Could not free serialized channel structure.");
+
+    err = dragon_chsend_close(&sendh);
+    if (err != DRAGON_SUCCESS)
+        append_err_return(err, "Could not close send handle.");
+
+    no_err_return(DRAGON_SUCCESS);
+}
+
+static dragonError_t
+_recv_stream_channel(dragonChannelDescr_t* from_chan, dragonChannelDescr_t* strm_ch, timespec_t* deadline)
+{
+    dragonError_t err;
+    dragonChannelSerial_t ser;
+    dragonChannelRecvh_t recvh;
+    uint64_t arg;
+
+    if (strm_ch == NULL)
+        err_return(DRAGON_INVALID_ARGUMENT, "The stream channel descriptor cannot be NULL.");
+
+    if (from_chan == NULL)
+        err_return(DRAGON_INVALID_ARGUMENT, "The channel to receive from cannot be NULL.");
+
+    err = dragon_channel_recvh(from_chan, &recvh, NULL);
+    if (err != DRAGON_SUCCESS)
+        append_err_return(err, "Could not initialize receive handle.");
+
+    err = dragon_chrecv_open(&recvh);
+    if (err != DRAGON_SUCCESS)
+        append_err_return(err, "Could not open receive handle on channel.");
+
+    /* We zero the pointer so the _recv_bytes_into will allocate space for us.
*/
+    ser.data = NULL;
+
+    err = _recv_bytes_into(&recvh, &ser.data, &ser.len, &arg, deadline);
+    if (err != DRAGON_SUCCESS)
+        append_err_return(err, "Could not receive the stream channel.");
+
+    err = dragon_channel_attach(&ser, strm_ch);
+    if (err != DRAGON_SUCCESS)
+        append_err_return(err, "Could not attach stream channel");
+
+    err = dragon_channel_serial_free(&ser);
+    if (err != DRAGON_SUCCESS)
+        append_err_return(err, "Could not free stream channel serialized descriptor.");
+
+    err = dragon_chrecv_close(&recvh);
+    if (err != DRAGON_SUCCESS)
+        append_err_return(err, "Could not close receive handle.");
+
+    no_err_return(DRAGON_SUCCESS);
+}
+
+static dragonError_t
+_empty_the_channel(dragonChannelDescr_t* channel)
+{
+    dragonError_t err;
+    dragonChannelRecvh_t recvh;
+    timespec_t deadline = {0,0};
+
+    err = dragon_channel_recvh(channel, &recvh, NULL);
+    if (err != DRAGON_SUCCESS)
+        append_err_return(err, "Could not create receive handle on channel.");
+
+    err = dragon_chrecv_open(&recvh);
+    if (err != DRAGON_SUCCESS)
+        append_err_return(err, "Could not open receive handle on channel.");
+
+    err = DRAGON_SUCCESS;
+
+    while (err == DRAGON_SUCCESS) {
+        dragonMemoryDescr_t mem;
+        uint64_t arg;
+
+        err = _recv_mem(&recvh, &mem, &arg, &deadline);
+
+        if (err == DRAGON_SUCCESS) {
+            err = dragon_memory_free(&mem);
+        }
+    }
+
+    if (err != DRAGON_CHANNEL_EMPTY)
+        append_err_return(err, "There was an error emptying a channel in the fli adapter.");
+
+    err = dragon_chrecv_close(&recvh);
+    if (err != DRAGON_SUCCESS)
+        append_err_return(err, "Could not close receive handle on channel being emptied.");
+
+    no_err_return(DRAGON_SUCCESS);
+}
+
+
+static void*
+_from_fd_to_fli (void* ptr)
+{
+    dragonError_t err;
+    uint8_t* buffer;
+    size_t num_bytes = 0;
+    _SenderArg_t* arg = (_SenderArg_t*) ptr;
+    int fd = arg->fd;
+    uint64_t user_arg = arg->user_arg;
+
+    buffer = malloc(arg->chunk_size);
+    if (buffer == NULL) {
+        err = DRAGON_INTERNAL_MALLOC_FAIL;
+        /* err might be logged eventually. */
+        fprintf(stderr, "ERROR: The chunk size of %lu could not be allocated for sending (ERR=%s).\n", arg->chunk_size, dragon_get_rc_string(err));
+        return NULL;
+    }
+
+    err = DRAGON_SUCCESS;
+    while ((err == DRAGON_SUCCESS) && ((num_bytes = read(arg->fd, buffer, arg->chunk_size)) > 0))
+        err = dragon_fli_send_bytes(arg->sendh, num_bytes, buffer, user_arg, arg->buffer, NULL);
+
+    if (err != DRAGON_SUCCESS) {
+        /* err might be logged eventually. But no way to return the error to user.
+           They will see a problem with the file descriptor. */
+        fprintf(stderr, "ERROR: There was an error sending bytes through the fli interface (ERR=%s).\n", dragon_get_rc_string(err));
+    }
+
+    close(fd);
+    free(buffer);
+
+    if (arg->buffer) {
+        err = dragon_fli_send_bytes(arg->sendh, 0, NULL, 0, false, NULL);
+        if (err != DRAGON_SUCCESS)
+            fprintf(stderr, "ERROR: Could not flush the buffered bytes from the file descriptor thread helper.\n");
+    }
+
+    pthread_exit(NULL);
+}
+
+
+static void*
+_from_fli_to_fd (void* ptr)
+{
+    dragonError_t err;
+    uint8_t* buffer = NULL; /* must start NULL so the receive allocates space */
+    uint64_t recv_arg;
+    size_t num_bytes = 0;
+    ssize_t written_bytes = 0;
+    _ReceiverArg_t* arg = (_ReceiverArg_t*) ptr;
+    int fd = arg->fd;
+
+    while ((err = dragon_fli_recv_bytes(arg->recvh, 0, &num_bytes, &buffer, &recv_arg, NULL)) == DRAGON_SUCCESS) {
+        written_bytes = 0;
+        while (written_bytes < num_bytes)
+            written_bytes += write(arg->fd, &buffer[written_bytes], num_bytes - written_bytes);
+
+        free(buffer);
+        buffer = NULL; /* reset so the next receive allocates fresh space */
+    }
+
+    if (err != DRAGON_EOT) {
+        /* err might be logged eventually.
*/ + fprintf(stderr, "ERROR: There was an error receiving data from the fli interface (ERR=%s).\n", dragon_get_rc_string(err)); + } + + close(fd); + pthread_exit(NULL); +} + +/****************************************************************************************/ +/* Beginning of user API */ +/****************************************************************************************/ + +dragonError_t +dragon_fli_attr_init(dragonFLIAttr_t* attr) +{ + attr->_placeholder = 0; + + return DRAGON_SUCCESS; +} + + +dragonError_t +dragon_fli_create(dragonFLIDescr_t* adapter, dragonChannelDescr_t* main_ch, + dragonChannelDescr_t* mgr_ch, dragonMemoryPoolDescr_t* pool, + const dragonULInt num_strm_chs, dragonChannelDescr_t** strm_channels, + const bool use_buffered_protocol, dragonFLIAttr_t* attrs) +{ + dragonError_t err; + dragonFLIAttr_t def_attr; + uint64_t msg_count; + timespec_t deadline = {0,0}; /* try once only */ + + if (adapter == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "Invalid fli adapter descriptor"); + + if (use_buffered_protocol) { + if (mgr_ch != NULL) + err_return(DRAGON_INVALID_ARGUMENT, "If using buffered protocol you may not provide a manager channel."); + + if (num_strm_chs > 0) + err_return(DRAGON_INVALID_ARGUMENT, "If using buffered protocol you may not provide stream channels."); + } + + /* the memory pool must be locally addressable if provided. */ + if (pool != NULL && !dragon_memory_pool_is_local(pool)) + err_return(DRAGON_INVALID_ARGUMENT, "Cannot directly access memory pool for fli adapter"); + + if (num_strm_chs > 0 && mgr_ch == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "If providing stream channels, you must provide a manager channel as well."); + + /* if the attrs are NULL populate a default one */ + if (attrs == NULL) { + err = dragon_fli_attr_init(&def_attr); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not initialize channel attributes."); + + attrs = &def_attr; + } else { + err = _validate_attr(attrs); + if (err != DRAGON_SUCCESS) + append_err_return(err, "FLI Attribute(s) are invalid."); + } + + /* this will be freed in the fli_destroy call */ + dragonFLI_t* obj = malloc(sizeof(dragonFLI_t)); + if (obj == NULL) + err_return(DRAGON_INTERNAL_MALLOC_FAIL, "Cannot allocate new file-like interface adapter."); + + obj->attrs = *attrs; + + if (pool == NULL) { + /* We will attach to the default pool in this case. 
*/
+        err = dragon_memory_pool_attach_default(&obj->pool);
+        if (err != DRAGON_SUCCESS)
+            append_err_return(err, "Could not attach to default pool.");
+    } else {
+        /* make a clone of the pool descriptor for use here */
+        err = dragon_memory_pool_descr_clone(&obj->pool, pool);
+        if (err != DRAGON_SUCCESS)
+            append_err_return(err, "Cannot clone pool descriptor");
+    }
+
+    obj->use_buffered_protocol = use_buffered_protocol;
+
+    if (main_ch != NULL) {
+        err = dragon_channel_descr_clone(&obj->main_ch, main_ch);
+        if (err != DRAGON_SUCCESS)
+            append_err_return(err, "Cannot clone main channel descriptor.");
+
+        err = dragon_channel_message_count(&obj->main_ch, &msg_count);
+        if (err != DRAGON_SUCCESS)
+            append_err_return(err, "Could not get the main channel message count during creation.");
+
+        if (msg_count > 0)
+            err_return(DRAGON_INVALID_ARGUMENT, "The main channel has items in it during adapter creation.");
+
+        obj->has_main_ch = true;
+    } else
+        obj->has_main_ch = false;
+
+    if (mgr_ch != NULL) {
+        err = dragon_channel_descr_clone(&obj->mgr_ch, mgr_ch);
+        if (err != DRAGON_SUCCESS)
+            append_err_return(err, "Cannot clone manager channel descriptor.");
+
+        err = dragon_channel_message_count(&obj->mgr_ch, &msg_count);
+        if (err != DRAGON_SUCCESS)
+            append_err_return(err, "Could not get the manager channel message count during creation.");
+
+        if (msg_count > 0)
+            err_return(DRAGON_INVALID_ARGUMENT, "The manager channel has items in it during adapter creation.");
+
+        obj->has_mgr_ch = true;
+    } else
+        obj->has_mgr_ch = false;
+
+    obj->num_strm_chs = num_strm_chs;
+
+    for (int idx=0; idx<num_strm_chs; idx++) {
+        err = _send_stream_channel(strm_channels[idx], &obj->mgr_ch, &obj->pool, &deadline);
+        if (err != DRAGON_SUCCESS)
+            append_err_return(err, "Could not deposit stream channel into manager channel.");
+    }
+
+    err = _add_umap_fli_entry(adapter, obj);
+    if (err != DRAGON_SUCCESS)
+        append_err_return(err, "failed to add umap entry for created adapter");
+
+    no_err_return(DRAGON_SUCCESS);
+}
+
+
+dragonError_t
+dragon_fli_destroy(dragonFLIDescr_t* adapter)
+{
+    dragonError_t err;
+    dragonFLI_t* obj;
+
+    if (adapter == NULL)
+        err_return(DRAGON_INVALID_ARGUMENT, "Invalid fli adapter descriptor");
+
+    err = _fli_from_descr(adapter, &obj);
+    if (err != DRAGON_SUCCESS)
+        append_err_return(err, "Could not resolve adapter to internal fli object");
+
+    if (obj->has_mgr_ch) {
+        err = _empty_the_channel(&obj->mgr_ch);
+        if (err != DRAGON_SUCCESS)
+            append_err_return(err, "Could not empty the manager channel.");
+    }
+
+    if (obj->has_main_ch) {
+        err = _empty_the_channel(&obj->main_ch);
+        if (err != DRAGON_SUCCESS)
+            append_err_return(err, "Could not empty the main channel.");
+    }
+
+    err = dragon_umap_delitem(dg_fli_adapters, adapter->_idx);
+    if (err != DRAGON_SUCCESS)
+        append_err_return(err, "failed to delete adapter from the adapters umap");
+
+    free(obj);
+    adapter->_idx = 0;
+
+    no_err_return(DRAGON_SUCCESS);
+}
+
+
+dragonError_t
+dragon_fli_serialize(const dragonFLIDescr_t* adapter, dragonFLISerial_t* serial)
+{
+    dragonError_t err;
+    dragonFLI_t* obj;
+    uint8_t adapter_type = 0;
+    dragonChannelSerial_t main_ch_ser;
+    dragonChannelSerial_t mgr_ch_ser;
+    void* ptr;
+
+    if (adapter == NULL)
+        err_return(DRAGON_INVALID_ARGUMENT, "Invalid fli adapter descriptor");
+
+    if (serial == NULL)
+        err_return(DRAGON_INVALID_ARGUMENT, "Invalid fli serial descriptor");
+
+    serial->data = NULL;
+    serial->len = 0;
+
+    err = _fli_from_descr(adapter, &obj);
+    if (err != DRAGON_SUCCESS)
+        append_err_return(err, "Could not resolve adapter to internal fli object");
+
+    if (obj->has_main_ch) {
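
With creation shown above, a minimal calling sketch may help. It is illustrative only: main_ch is assumed to be an already-created channel descriptor, pool an already-attached pool, and error handling is reduced to a print.

    /* Sketch (assumed context): create a buffered FLI adapter, which needs only a main channel. */
    dragonFLIDescr_t fli;
    dragonError_t derr = dragon_fli_create(&fli, &main_ch, NULL /* no manager */, &pool,
                                           0, NULL /* no stream channels */,
                                           true /* buffered protocol */, NULL /* default attrs */);
    if (derr != DRAGON_SUCCESS)
        fprintf(stderr, "fli create failed: %s\n", dragon_get_rc_string(derr));
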
adapter_type+=FLI_HAS_MAIN_CHANNEL; + err = dragon_channel_serialize(&obj->main_ch, &main_ch_ser); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not serialize main channel of fli adapter."); + + serial->len+=main_ch_ser.len + sizeof(main_ch_ser.len); + } + + if (obj->has_mgr_ch) { + adapter_type+=FLI_HAS_MANAGER_CHANNEL; + err = dragon_channel_serialize(&obj->mgr_ch, &mgr_ch_ser); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not serialize manager channel of fli adapter."); + + serial->len+=mgr_ch_ser.len + sizeof(mgr_ch_ser.len); + } + + if (obj->use_buffered_protocol) + adapter_type+=FLI_USING_BUFFERED_PROTOCOL; + + /* Add the one byte for the adapter type. */ + serial->len += sizeof(uint8_t); + + ptr = malloc(serial->len); + if (ptr == NULL) + err_return(DRAGON_INTERNAL_MALLOC_FAIL, "Could not malloc space for serialized descriptor"); + + serial->data = ptr; + memcpy(ptr, &adapter_type, sizeof(uint8_t)); + ptr+=sizeof(uint8_t); + + if (obj->has_main_ch) { + memcpy(ptr, &main_ch_ser.len, sizeof(main_ch_ser.len)); + ptr+=sizeof(main_ch_ser.len); + memcpy(ptr, main_ch_ser.data, main_ch_ser.len); + ptr+=main_ch_ser.len; + err = dragon_channel_serial_free(&main_ch_ser); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not free serialized descriptor for main channel"); + } + + if (obj->has_mgr_ch) { + memcpy(ptr, &mgr_ch_ser.len, sizeof(mgr_ch_ser.len)); + ptr+=sizeof(mgr_ch_ser.len); + memcpy(ptr, mgr_ch_ser.data, mgr_ch_ser.len); + ptr+=mgr_ch_ser.len; + err = dragon_channel_serial_free(&mgr_ch_ser); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not free serialized descriptor for manager channel"); + } + + no_err_return(DRAGON_SUCCESS); +} + + +dragonError_t +dragon_fli_serial_free(dragonFLISerial_t* serial) +{ + if (serial == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "Invalid serialized fli adapter."); + + if (serial->data == NULL) + no_err_return(DRAGON_SUCCESS); + + free(serial->data); + serial->data = NULL; + + no_err_return(DRAGON_SUCCESS); +} + + +dragonError_t +dragon_fli_attach(const dragonFLISerial_t* serial, const dragonMemoryPoolDescr_t* pool, + dragonFLIDescr_t* adapter) +{ + dragonError_t err; + dragonFLI_t* obj; + uint8_t adapter_type = 0; + dragonChannelSerial_t ch_ser; + void* ptr; + dragonFLIAttr_t* attrs = NULL; /* Perhaps this needs to be an argument. Or perhaps + attrs should be included in the serialized descriptor? */ + + if (serial == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "Invalid serialized fli adapter."); + + if (adapter == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "Invalid fli adapter."); + + /* if the attrs are NULL populate a default one */ + dragonFLIAttr_t def_attr; + if (attrs == NULL) { + + err = dragon_fli_attr_init(&def_attr); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not initialize channel attributes."); + + attrs = &def_attr; + } else { + + err = _validate_attr(attrs); + if (err != DRAGON_SUCCESS) + append_err_return(err, "FLI Attribute(s) are invalid."); + } + + /* this will be freed in the fli_destroy call */ + obj = malloc(sizeof(dragonFLI_t)); + if (obj == NULL) + err_return(DRAGON_INTERNAL_MALLOC_FAIL, "Cannot allocate new file-like interface adapter for attaching."); + + obj->attrs = *attrs; + obj->num_strm_chs = 0; /* We don't keep track of it the channels in attached objects */ + + if (pool == NULL) { + /* We will attach to the default pool in this case. 
*/ + err = dragon_memory_pool_attach_default(&obj->pool); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not attach to default pool."); + } else { + /* make a clone of the pool descriptor for use here */ + err = dragon_memory_pool_descr_clone(&obj->pool, pool); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Cannot clone pool descriptor"); + } + + ptr=serial->data; + memcpy(&adapter_type, ptr, sizeof(adapter_type)); + ptr+=sizeof(adapter_type); + + obj->use_buffered_protocol = (adapter_type & FLI_USING_BUFFERED_PROTOCOL) != 0; + obj->has_main_ch = (adapter_type & FLI_HAS_MAIN_CHANNEL) != 0; + obj->has_mgr_ch = (adapter_type & FLI_HAS_MANAGER_CHANNEL) != 0; + + if (obj->has_main_ch) { + memcpy(&ch_ser.len, ptr, sizeof(ch_ser.len)); + ptr+=sizeof(ch_ser.len); + ch_ser.data = ptr; + err = dragon_channel_attach(&ch_ser, &obj->main_ch); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Cannot attach to main channel of adapter."); + ptr+=ch_ser.len; + } + + if (obj->has_mgr_ch) { + memcpy(&ch_ser.len, ptr, sizeof(ch_ser.len)); + ptr+=sizeof(ch_ser.len); + ch_ser.data = ptr; + err = dragon_channel_attach(&ch_ser, &obj->mgr_ch); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Cannot attach to manager channel of adapter."); + ptr+=ch_ser.len; + } + + err = _add_umap_fli_entry(adapter, obj); + if (err != DRAGON_SUCCESS) + append_err_return(err, "failed to add umap entry for attached adapter"); + + no_err_return(DRAGON_SUCCESS); +} + + +dragonError_t +dragon_fli_detach(dragonFLIDescr_t* adapter) +{ + dragonError_t err; + dragonFLI_t* obj; + + if (adapter == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "Invalid fli adapter descriptor"); + + err = _fli_from_descr(adapter, &obj); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not resolve adapter to internal fli object"); + + err = dragon_umap_delitem(dg_fli_adapters, adapter->_idx); + if (err != DRAGON_SUCCESS) + append_err_return(err, "failed to delete adapter from from adapters umap"); + + free(obj); + adapter->_idx = 0; + + no_err_return(DRAGON_SUCCESS); +} + + +dragonError_t +dragon_fli_open_send_handle(const dragonFLIDescr_t* adapter, dragonFLISendHandleDescr_t* send_handle, + dragonChannelDescr_t* strm_ch, const timespec_t* timeout) +{ + dragonError_t err; + dragonFLI_t* obj; + dragonFLISendHandle_t* sendh_obj; + timespec_t* deadline = NULL; + timespec_t end_time; + + if (adapter == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "Invalid fli adapter descriptor"); + + if (send_handle == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "Invalid fli send handle descriptor"); + + if (timeout != NULL) { + deadline = &end_time; + err = dragon_timespec_deadline(timeout, deadline); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not compute timeout deadline."); + } + + err = _fli_from_descr(adapter, &obj); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not resolve adapter to internal fli object"); + + if (obj->use_buffered_protocol && strm_ch != NULL) + err_return(DRAGON_INVALID_ARGUMENT, "Cannot supply a stream channel while using buffered protocol"); + + sendh_obj = malloc(sizeof(dragonFLISendHandle_t)); + + if (sendh_obj == NULL) + err_return(DRAGON_INTERNAL_MALLOC_FAIL, "Could not allocate send handle."); + + sendh_obj->adapter = obj; + sendh_obj->buffered_allocations = NULL; + sendh_obj->total_bytes = 0; + sendh_obj->tid = 0; + + if (obj->use_buffered_protocol) { + /* The main channel send handle will be opened only when a buffered message is written, + which occurs when this send 
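
The serialize/attach/detach trio above supports handing an adapter to another process. A hedged sketch, assuming the serialized bytes travel out of band and reusing the fli and derr names from the earlier sketch:

    /* Sketch: process A serializes, process B attaches and later detaches. */
    dragonFLISerial_t ser;
    derr = dragon_fli_serialize(&fli, &ser);
    /* ... ser.data (ser.len bytes) is transmitted to the other process ... */

    dragonFLIDescr_t fli2;
    derr = dragon_fli_attach(&ser, NULL /* use default pool */, &fli2);
    derr = dragon_fli_serial_free(&ser);
    /* ... use fli2 ... */
    derr = dragon_fli_detach(&fli2); /* detach does not destroy the underlying channels */
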
handle is closed.*/ + sendh_obj->user_supplied = false; + err = dragon_channel_sendh(&obj->main_ch, &sendh_obj->chan_sendh, NULL); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not create send handle on main channel."); + + err = dragon_chsend_open(&sendh_obj->chan_sendh); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not open send handle on main channel."); + + } else { + if (strm_ch == STREAM_CHANNEL_IS_MAIN_FOR_1_1_CONNECTION) { + if (!obj->has_main_ch) + err_return(DRAGON_INVALID_ARGUMENT, "The adapter needs a main channel when specifying to use main channel as stream channel."); + + if (obj->has_mgr_ch) + err_return(DRAGON_INVALID_ARGUMENT, "You cannot use 1:1 mode on the fli when there is a manager channel."); + + sendh_obj->user_supplied = false; + + err = dragon_channel_descr_clone(&sendh_obj->strm_channel, &obj->main_ch); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Cannot clone main channel descriptor as stream channel descriptor."); + } else if (strm_ch != NULL) { + /* This is a user-supplied channel so we keep track of that. */ + if (!obj->has_main_ch) + err_return(DRAGON_INVALID_ARGUMENT, "The adapter needs a main channel when a sender provided stream channel is given."); + + sendh_obj->user_supplied = true; + + err = dragon_channel_descr_clone(&sendh_obj->strm_channel, strm_ch); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Cannot clone stream channel descriptor."); + } else { + /* It is a manager supplied stream. */ + if (!obj->has_mgr_ch) + err_return(DRAGON_INVALID_ARGUMENT, "You must provide a stream channel when there is no manager channel."); + + /* We are using a stream channel from the manager channel. */ + sendh_obj->user_supplied = false; + + err = _recv_stream_channel(&obj->mgr_ch, &sendh_obj->strm_channel, deadline); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not get stream channel from manager channel."); + + strm_ch = &sendh_obj->strm_channel; + } + + err = dragon_channel_sendh(&sendh_obj->strm_channel, &sendh_obj->chan_sendh, NULL); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not create send handle on stream channel."); + + err = dragon_chsend_open(&sendh_obj->chan_sendh); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not open send handle on stream channel."); + + if (obj->has_main_ch && strm_ch != STREAM_CHANNEL_IS_MAIN_FOR_1_1_CONNECTION) { + /* If it has no main channel, then the stream channel was receiver supplied. In + all other cases (when not buffered or not the special case of a 1:1 connection) + the stream channel is written into the main + channel so a receiver can receive it and start reading while writing is occurring. 
*/ + err = _send_stream_channel(strm_ch, &obj->main_ch, &obj->pool, deadline); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not deposit stream channel into main channel."); + } + } + + err = _add_umap_fli_sendh_entry(send_handle, sendh_obj); + if (err != DRAGON_SUCCESS) + append_err_return(err, "failed to add umap entry for created send handle"); + + no_err_return(DRAGON_SUCCESS); +} + + +dragonError_t +dragon_fli_close_send_handle(dragonFLISendHandleDescr_t* send_handle, const timespec_t* timeout) +{ + dragonError_t err; + dragonFLISendHandle_t* sendh_obj; + timespec_t* deadline = NULL; + timespec_t end_time; + + if (send_handle == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "Invalid fli send handle descriptor"); + + if (timeout != NULL) { + deadline = &end_time; + err = dragon_timespec_deadline(timeout, deadline); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not compute timeout deadline."); + } + + err = _fli_sendh_from_descr(send_handle, &sendh_obj); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not resolve send handle to internal fli send handle object"); + + if (sendh_obj->tid != 0) + err_return(DRAGON_INVALID_OPERATION, "You must close the created file descriptor and call dragon_finalize_writable_fd first."); + + if (sendh_obj->buffered_allocations != NULL) { + /* buffered bytes remain to send on close of send handle. */ + err = _send_buffered_bytes(sendh_obj, deadline); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not send buffered data."); + } + + if (!sendh_obj->adapter->use_buffered_protocol) { + /* sending the EOT indicator for the stream. */ + err = _send_bytes(&sendh_obj->chan_sendh, &sendh_obj->adapter->pool, NULL, 0, FLI_EOT, deadline); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not send the end of stream indicator down the stream channel."); + } + + err = dragon_chsend_close(&sendh_obj->chan_sendh); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not close send handle on channel"); + + /* remove the item from the umap */ + err = dragon_umap_delitem(dg_fli_send_handles, send_handle->_idx); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Failed to delete item from fli send handle umap."); + + send_handle->_idx = 0; + + free(sendh_obj); + + no_err_return(DRAGON_SUCCESS); + +} + +dragonError_t +dragon_fli_open_recv_handle(const dragonFLIDescr_t* adapter, dragonFLIRecvHandleDescr_t* recv_handle, + dragonChannelDescr_t* strm_ch, const timespec_t* timeout) +{ + dragonError_t err; + dragonFLI_t* obj; + dragonFLIRecvHandle_t* recvh_obj; + timespec_t* deadline = NULL; + timespec_t end_time; + + if (adapter == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "Invalid fli adapter descriptor"); + + if (recv_handle == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "Invalid fli receive handle descriptor"); + + if (timeout != NULL) { + deadline = &end_time; + err = dragon_timespec_deadline(timeout, deadline); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not compute timeout deadline."); + } + + err = _fli_from_descr(adapter, &obj); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not resolve adapter to internal fli object"); + + if (obj->use_buffered_protocol && strm_ch != NULL) + err_return(DRAGON_INVALID_ARGUMENT, "Cannot supply a stream channel while using buffered protocol"); + + recvh_obj = malloc(sizeof(dragonFLIRecvHandle_t)); + if (recvh_obj == NULL) + err_return(DRAGON_INTERNAL_MALLOC_FAIL, "Could not allocate receive handle."); + + recvh_obj->adapter = obj; + 
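
The send handle defined above admits three stream-channel modes: a manager-supplied channel (strm_ch == NULL), a sender-supplied channel, or the main channel itself via STREAM_CHANNEL_IS_MAIN_FOR_1_1_CONNECTION. A sketch of the first mode, with fli and derr assumed from the earlier sketches:

    /* Sketch: open, write, close; close sends the EOT marker on non-buffered adapters. */
    dragonFLISendHandleDescr_t sendh;
    timespec_t timeout = {5, 0}; /* five seconds */

    derr = dragon_fli_open_send_handle(&fli, &sendh, NULL /* manager-supplied stream */, &timeout);
    /* ... dragon_fli_send_bytes()/dragon_fli_send_mem() calls go here ... */
    derr = dragon_fli_close_send_handle(&sendh, &timeout);
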
recvh_obj->user_supplied = false; + recvh_obj->stream_received = false; + recvh_obj->EOT_received = false; + recvh_obj->num_bytes_received = 0; + recvh_obj->buffered_bytes = 0; + recvh_obj->tid = 0; + + /* Creating a dummy head node simplifies management of the linked list. */ + recvh_obj->buffered_data = malloc(sizeof(dragonFLIRecvBufAlloc_t)); + if (recvh_obj->buffered_data == NULL) + err_return(DRAGON_INTERNAL_MALLOC_FAIL, "Could not malloc dummy node in receive handle."); + + recvh_obj->buffered_data->num_bytes = 0; + recvh_obj->buffered_data->next = NULL; + recvh_obj->tail = recvh_obj->buffered_data; + + if (obj->use_buffered_protocol) { + /* With buffered protocol we receive off of the main channel. */ + err = dragon_channel_recvh(&recvh_obj->adapter->main_ch, &recvh_obj->chan_recvh, NULL); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not create recv handle on stream channel."); + + err = dragon_chrecv_open(&recvh_obj->chan_recvh); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not open recv handle on stream channel."); + + } else { + if (strm_ch == STREAM_CHANNEL_IS_MAIN_FOR_1_1_CONNECTION) { + /* The main channel is used as the stream channel for a 1:1 connection */ + if (!obj->has_main_ch) + err_return(DRAGON_INVALID_ARGUMENT, "The adapter needs a main channel when a receiver specifies use main."); + + if (obj->has_mgr_ch) + err_return(DRAGON_INVALID_ARGUMENT, "You cannot use 1:1 mode on the fli when there is a manager channel."); + + recvh_obj->user_supplied = false ; + + err = dragon_channel_descr_clone(&recvh_obj->strm_channel, &obj->main_ch); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Cannot clone main channel as stream channel descriptor."); + + } else if (strm_ch != NULL) { + /* A user supplied stream channel will be used. */ + if (!obj->has_mgr_ch) + err_return(DRAGON_INVALID_ARGUMENT, "The adapter needs a manager channel when a receiver provided stream channel is given."); + + /* This is a user-supplied channel so we keep track of that. */ + recvh_obj->user_supplied = true; + + err = dragon_channel_descr_clone(&recvh_obj->strm_channel, strm_ch); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Cannot clone stream channel descriptor."); + + /* We add it to the manager channel so a sender can pick it up. */ + err = _send_stream_channel(strm_ch, &obj->mgr_ch, &obj->pool, deadline); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not deposit stream channel into manager channel."); + + } else { + /* A main channel supplied stream channel will be used */ + if (!obj->has_main_ch) + err_return(DRAGON_INVALID_ARGUMENT, "You must provide a stream channel when there is no main channel."); + + /* We are using a stream channel from the main channel. 
*/ + err = _recv_stream_channel(&obj->main_ch, &recvh_obj->strm_channel, deadline); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not get stream channel from manager channel."); + } + + err = dragon_channel_recvh(&recvh_obj->strm_channel, &recvh_obj->chan_recvh, NULL); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not create recv handle on stream channel."); + + err = dragon_chrecv_open(&recvh_obj->chan_recvh); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not open recv handle on stream channel."); + } + + err = _add_umap_fli_recvh_entry(recv_handle, recvh_obj); + if (err != DRAGON_SUCCESS) + append_err_return(err, "failed to add umap entry for created receive handle"); + + no_err_return(DRAGON_SUCCESS); +} + + +dragonError_t +dragon_fli_close_recv_handle(dragonFLIRecvHandleDescr_t* recv_handle, const timespec_t* timeout) +{ + dragonError_t err; + dragonFLIRecvHandle_t* recvh_obj; + timespec_t* deadline = NULL; + timespec_t end_time; + + if (recv_handle == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "Invalid fli receive handle descriptor"); + + if (timeout != NULL) { + deadline = &end_time; + err = dragon_timespec_deadline(timeout, deadline); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not compute timeout deadline."); + } + + err = _fli_recvh_from_descr(recv_handle, &recvh_obj); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not resolve receive handle to internal fli receive handle object"); + + if (recvh_obj->tid != 0) + err_return(DRAGON_INVALID_OPERATION, "You must close the created file descriptor and call dragon_finalize_readable_fd first."); + + err = dragon_chrecv_close(&recvh_obj->chan_recvh); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not close adapters stream channel receive handle."); + + /* We check here that the entire stream had been read. Otherwise we are not + done with the stream and we will be leaving partial data in the stream + which is not good since it would lead to unpredictable results on + future reads. */ + if (!recvh_obj->stream_received) + err_return(DRAGON_INVALID_OPERATION, "Cannot close receive handle with partially read stream."); + + if (!recvh_obj->user_supplied && !recvh_obj->adapter->use_buffered_protocol && recvh_obj->adapter->has_mgr_ch) { + /* We are not using the buffered protocol and stream channel must be + returned to the manager channel if there is one. 
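
The matching receiver-side sketch, under the same assumptions; note from the close logic above that the handle may only be closed once the stream has been read through EOT:

    /* Sketch: take the next advertised stream off the main channel and read it. */
    dragonFLIRecvHandleDescr_t recvh;

    derr = dragon_fli_open_recv_handle(&fli, &recvh, NULL /* main-channel-supplied stream */, &timeout);
    /* ... receive until DRAGON_EOT is returned (see the receive loop sketch further on) ... */
    derr = dragon_fli_close_recv_handle(&recvh, &timeout);
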
*/ + err = _send_stream_channel(&recvh_obj->strm_channel, &recvh_obj->adapter->mgr_ch, &recvh_obj->adapter->pool, deadline); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not return stream channel to manager channel in receive handle close of FLI adapter."); + } + + /* remove the item from the umap */ + err = dragon_umap_delitem(dg_fli_recv_handles, recv_handle->_idx); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Failed to delete item from fli receive handle umap."); + + recv_handle->_idx = 0; + free(recvh_obj); + + no_err_return(DRAGON_SUCCESS); +} + +dragonError_t +dragon_fli_create_writable_fd(dragonFLISendHandleDescr_t* send_handle, int* fd_ptr, + const bool buffer, size_t chunk_size, + const uint64_t user_arg, const timespec_t* timeout) +{ + dragonError_t err; + dragonFLISendHandle_t* sendh_obj; + + if (send_handle == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "The FLI send handle cannot be NULL."); + + if (fd_ptr == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "The file descriptor pointer cannot be NULL."); + + if (chunk_size == 0) + chunk_size = DEFAULT_CHUNK_SIZE; + + err = _fli_sendh_from_descr(send_handle, &sendh_obj); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not resolve send handle to internal fli send handle object"); + + if (sendh_obj->tid != 0) + err_return(DRAGON_INVALID_OPERATION, "Cannot create a file descriptor when another is in use. Close and finalize first."); + + /* Create the pipe. */ + if (pipe(sendh_obj->pipe)) + err_return(DRAGON_FAILURE, "Could not create a pipe for the file descriptor open."); + + _SenderArg_t* arg = malloc(sizeof(_SenderArg_t)); + if (arg == NULL) + err_return (DRAGON_INTERNAL_MALLOC_FAIL, "Could not allocate space for malloc'ed thread argument."); + + arg->sendh = send_handle; + arg->user_arg = user_arg; + arg->chunk_size = chunk_size; + arg->buffer = buffer; + arg->fd = sendh_obj->pipe[0]; /* pass along the read end to the thread. */ + + int perr = pthread_create(&sendh_obj->tid, NULL, _from_fd_to_fli, arg); + + if (perr != 0) { + char err_str[200]; + sendh_obj->tid = 0; + snprintf(err_str, 199, "There was an error on the pthread_create call. ERR=%d", perr); + err_return(DRAGON_FAILURE, err_str); + } + + *fd_ptr = sendh_obj->pipe[1]; + no_err_return(DRAGON_SUCCESS); +} + +dragonError_t +dragon_fli_finalize_writable_fd(dragonFLISendHandleDescr_t* send_handle) +{ + dragonError_t err; + dragonFLISendHandle_t* sendh_obj; + + if (send_handle == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "The FLI send handle cannot be NULL."); + + err = _fli_sendh_from_descr(send_handle, &sendh_obj); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not resolve send handle to internal fli send handle object"); + + if (sendh_obj->tid != 0) { + int perr; + void* retval; + + /* We must join here to prevent the receive handle from being destroyed before the thread + managing the file descriptor exits. */ + perr = pthread_join(sendh_obj->tid, &retval); + + if (perr != 0) { + char err_str[200]; + snprintf(err_str, 199, "There was an error on the pthread_join call while closing the send handle. 
ERR=%d", perr); + err_return(DRAGON_FAILURE, err_str); + } + sendh_obj->tid = 0; + } + + no_err_return(DRAGON_SUCCESS); +} + +dragonError_t +dragon_fli_create_readable_fd(dragonFLIRecvHandleDescr_t* recv_handle, int* fd_ptr, const timespec_t* timeout) +{ + dragonError_t err; + dragonFLIRecvHandle_t* recvh_obj; + + if (recv_handle == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "The FLI receive handle cannot be NULL."); + + if (fd_ptr == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "The file descriptor pointer cannot be NULL."); + + err = _fli_recvh_from_descr(recv_handle, &recvh_obj); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not resolve receive handle to internal fli receive handle object"); + + if (recvh_obj->tid != 0) + err_return(DRAGON_INVALID_OPERATION, "Cannot create a file descriptor when another is in use. Close and finalize first."); + + /* Create the pipe. */ + if (pipe(recvh_obj->pipe)) { + err_return(DRAGON_FAILURE, "Could not create a pipe for the file descriptor open."); + } + + _ReceiverArg_t* arg = malloc(sizeof(_ReceiverArg_t)); + if (arg == NULL) + err_return (DRAGON_INTERNAL_MALLOC_FAIL, "Could not allocate space for malloc'ed thread argument."); + + arg->recvh = recv_handle; + arg->fd = recvh_obj->pipe[1]; /* pass along the write end to the thread. */ + + int perr = pthread_create(&recvh_obj->tid, NULL, _from_fli_to_fd, arg); + + if (perr != 0) { + char err_str[200]; + recvh_obj->tid = 0; + snprintf(err_str, 199, "There was an error on the pthread_create call. ERR=%d", perr); + err_return(DRAGON_FAILURE, err_str); + } + + *fd_ptr = recvh_obj->pipe[0]; /* Give the read end back to the caller. */ + no_err_return(DRAGON_SUCCESS); +} + +dragonError_t +dragon_fli_finalize_readable_fd(dragonFLIRecvHandleDescr_t* recv_handle) +{ + dragonError_t err; + dragonFLIRecvHandle_t* recvh_obj; + + if (recv_handle == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "The FLI receive handle cannot be NULL."); + + err = _fli_recvh_from_descr(recv_handle, &recvh_obj); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not resolve receive handle to internal fli receive handle object"); + + if (recvh_obj->tid != 0) { + int perr; + void* retval; + + /* We must join here to prevent the receive handle from being destroyed before the thread managing + the file descriptor exits. */ + perr = pthread_join(recvh_obj->tid, &retval); + + if (perr != 0) { + char err_str[200]; + snprintf(err_str, 199, "There was an error on the pthread_kill call while closing the receive handle. 
ERR=%d", perr); + err_return(DRAGON_FAILURE, err_str); + } + recvh_obj->tid = 0; + } + + no_err_return(DRAGON_SUCCESS); +} + +dragonError_t +dragon_fli_send_bytes(dragonFLISendHandleDescr_t* send_handle, size_t num_bytes, + uint8_t* bytes, uint64_t arg, const bool buffer, const timespec_t* timeout) +{ + dragonError_t err; + dragonFLISendHandle_t* sendh_obj; + timespec_t* deadline = NULL; + timespec_t end_time; + + if (send_handle == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "Invalid fli send handle descriptor"); + + if (bytes == NULL && num_bytes > 0) + err_return(DRAGON_INVALID_ARGUMENT, "Cannot send non-zer number of bytes with NULL pointer."); + + if (timeout != NULL) { + deadline = &end_time; + err = dragon_timespec_deadline(timeout, deadline); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not compute timeout deadline."); + } + + err = _fli_sendh_from_descr(send_handle, &sendh_obj); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not resolve send handle to internal fli send handle object"); + + /* buffering bytes to send */ + err = _buffer_bytes(sendh_obj, bytes, num_bytes, arg); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not buffer bytes."); + + if (!sendh_obj->adapter->use_buffered_protocol && !buffer) { + /* buffered bytes are sent when not buffered protocol and flushing was + requested (default behavior) */ + err = _send_buffered_bytes(sendh_obj, deadline); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not send data."); + } + + no_err_return(DRAGON_SUCCESS); +} + + +dragonError_t +dragon_fli_send_mem(dragonFLISendHandleDescr_t* send_handle, dragonMemoryDescr_t* mem, + uint64_t arg, const timespec_t* timeout) +{ + dragonError_t err; + dragonFLISendHandle_t* sendh_obj; + timespec_t* deadline = NULL; + timespec_t end_time; + + + if (send_handle == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "Invalid fli send handle descriptor"); + + if (mem == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "You must provide a valid memory descriptor pointer."); + + if (timeout != NULL) { + deadline = &end_time; + err = dragon_timespec_deadline(timeout, deadline); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not compute timeout deadline."); + } + + err = _fli_sendh_from_descr(send_handle, &sendh_obj); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not resolve send handle to internal fli send handle object"); + + if (sendh_obj->adapter->use_buffered_protocol) + err_return(DRAGON_INVALID_ARGUMENT, "You cannot use dragon_fli_send_mem on a buffered fli adapter. 
Use dragon_fli_send_bytes instead."); + + /* sending mem on stream channel */ + err = _send_mem(&sendh_obj->chan_sendh, mem, arg, deadline); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not send the managed memory down the stream channel."); + + no_err_return(DRAGON_SUCCESS); +} + + +dragonError_t +dragon_fli_recv_bytes_into(dragonFLIRecvHandleDescr_t* recv_handle, size_t requested_size, + size_t* received_size, uint8_t* bytes, uint64_t* arg, + const timespec_t* timeout) +{ + dragonError_t err; + uint8_t* buffer_ptr = bytes; + + if (bytes == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "You must provide a pointer to the allocated space for the received bytes."); + + err = _recv_bytes_common(recv_handle, requested_size, received_size, &buffer_ptr, arg, timeout); + if (err != DRAGON_SUCCESS && err != DRAGON_EOT) + append_err_return(err, "Could not receive bytes into."); + + no_err_return(err); +} + + +dragonError_t +dragon_fli_recv_bytes(dragonFLIRecvHandleDescr_t* recv_handle, size_t requested_size, + size_t* received_size, uint8_t** bytes, uint64_t* arg, + const timespec_t* timeout) +{ + dragonError_t err; + + if (bytes == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "You must provide a pointer to a pointer for the received bytes."); + + /* Initializing the pointer to NULL guarantees that the internal function will allocate space for the + received bytes. */ + *bytes = NULL; + + err = _recv_bytes_common(recv_handle, requested_size, received_size, bytes, arg, timeout); + if (err != DRAGON_SUCCESS && err != DRAGON_EOT) + append_err_return(err, "Could not receive bytes into."); + + no_err_return(err); +} + + +dragonError_t +dragon_fli_recv_mem(dragonFLIRecvHandleDescr_t* recv_handle, dragonMemoryDescr_t* mem, + uint64_t* arg, const timespec_t* timeout) + +{ + dragonError_t err; + dragonFLIRecvHandle_t* recvh_obj; + timespec_t* deadline = NULL; + timespec_t end_time; + size_t received_size; + + if (recv_handle == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "Invalid FLI receive handle descriptor"); + + if (mem == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "You must provide a pointer to a memory descriptor."); + + if (arg == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "You must provide a pointer to a variable for the received arg metadata."); + + if (timeout != NULL) { + deadline = &end_time; + err = dragon_timespec_deadline(timeout, deadline); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not compute timeout deadline."); + } + + err = _fli_recvh_from_descr(recv_handle, &recvh_obj); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not resolve receive handle to internal fli receive handle object"); + + if (recvh_obj->stream_received) + /* data has been read already so we return our end of stream error code. */ + err_return(DRAGON_EOT, "End of Stream. 
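
dragon_fli_recv_bytes allocates the landing buffer (the caller frees it), while dragon_fli_recv_bytes_into copies into caller-owned storage. A sketch of the latter, assuming requested_size caps how many bytes are placed into the supplied buffer; recvh and timeout are reused from the earlier sketches:

    /* Sketch: receive into a caller-owned buffer. */
    uint8_t buf[1024];
    size_t got = 0;
    uint64_t meta = 0;
    derr = dragon_fli_recv_bytes_into(&recvh, sizeof(buf), &got, buf, &meta, &timeout);
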
You must close and re-open receive handle."); + + err = _fli_recvh_from_descr(recv_handle, &recvh_obj); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not resolve receive handle to internal fli receive handle object"); + + err = _recv_mem(&recvh_obj->chan_recvh, mem, arg, deadline); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Error occurred while receiving data."); + + err = dragon_memory_get_size(mem, &received_size); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not get size of received memory descriptor."); + + recvh_obj->num_bytes_received += received_size; + + if (received_size == 0 && *arg == FLI_EOT) { + recvh_obj->stream_received = true; + *arg = 0; /* FLI_EOT is internal only so don't expose it. */ + append_err_return(DRAGON_EOT, "Reached the end of stream"); + } + + if (recvh_obj->adapter->use_buffered_protocol) + /* When buffered, mark stream as received after first read. */ + recvh_obj->stream_received = true; + + no_err_return(DRAGON_SUCCESS); +} \ No newline at end of file diff --git a/src/lib/heap_manager.c b/src/lib/heap_manager.c index 99e5a31..44b7abb 100644 --- a/src/lib/heap_manager.c +++ b/src/lib/heap_manager.c @@ -950,7 +950,7 @@ dragonError_t dragon_heap_init(void* ptr, dragonDynHeap_t* heap, const size_t ma derr = dragon_heap_malloc(heap, size, &allocations[idx]); if (derr != DRAGON_SUCCESS) { char err_str[200]; - sprintf((char*)&err_str, "Could not satisfy preallocated block allocation request of size %lu", size); + snprintf(err_str, 199, "Could not satisfy preallocated block allocation request of size %lu", size); append_err_return(derr, err_str); } @@ -1233,7 +1233,7 @@ dragonError_t dragon_heap_malloc(dragonDynHeap_t* heap, const size_t size, void* // in the heap. *ptr = NULL; char err_str[200]; - sprintf((char*)&err_str, "Will never be able to satisfy dragon_heap_malloc request of size %lu. This may be because of static pre-allocations.", size); + snprintf(err_str, 199, "Will never be able to satisfy dragon_heap_malloc request of size %lu. 
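
A receive loop over dragon_fli_recv_mem, completed above, mirrors the recv-and-free pattern of _empty_the_channel: DRAGON_EOT marks a fully consumed stream rather than an error.

    /* Sketch: drain a stream one managed-memory message at a time. */
    dragonMemoryDescr_t mem;
    uint64_t meta = 0;

    while ((derr = dragon_fli_recv_mem(&recvh, &mem, &meta, &timeout)) == DRAGON_SUCCESS) {
        /* ... use the received allocation ... */
        derr = dragon_memory_free(&mem);
    }
    if (derr != DRAGON_EOT)
        fprintf(stderr, "receive failed: %s\n", dragon_get_rc_string(derr));
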
This may be because of static pre-allocations.", size); err_return(DRAGON_DYNHEAP_REQUESTED_SIZE_TOO_LARGE, err_str); } diff --git a/src/lib/managed_memory.c b/src/lib/managed_memory.c index 481d858..8e7895a 100644 --- a/src/lib/managed_memory.c +++ b/src/lib/managed_memory.c @@ -38,6 +38,26 @@ static dragonMap_t * dg_mallocs = NULL; }\ }) +#define _maybe_obtain_manifest_lock(pool) ({\ + dragonError_t err = dragon_lock(&pool->mlock);\ + if (err != DRAGON_SUCCESS && err != DRAGON_OBJECT_DESTROYED) {\ + char * err_str = _errstr_with_code("manifest lock error code", (int)err);\ + err_noreturn(err_str);\ + free(err_str);\ + return err;\ + }\ +}) + +#define _maybe_release_manifest_lock(pool) ({\ + dragonError_t err = dragon_unlock(&pool->mlock);\ + if (err != DRAGON_SUCCESS && err != DRAGON_OBJECT_DESTROYED) {\ + char * err_str = _errstr_with_code("manifest unlock error code", (int)err);\ + err_noreturn(err_str);\ + free(err_str);\ + return err;\ + }\ +}) + static void _find_pow2_requirement(size_t v, size_t * nv, uint32_t * power) { @@ -475,10 +495,24 @@ _alloc_pool_shm(dragonMemoryPool_t * pool, const char * base_name, dragonMemoryP no_err_return(DRAGON_SUCCESS); } +static bool +_pool_is_destroyed(dragonMemoryPool_t * pool) +{ + return dragon_lock_is_valid(&pool->mlock) != true; +} + static dragonError_t _free_pool_shm(dragonMemoryPool_t * pool, dragonMemoryPoolAttr_t * attr) { - dragonError_t err = _unmap_manifest_shm(pool, attr); + dragonError_t err; + bool pool_is_destroyed = _pool_is_destroyed(pool); + + /* Free the lock since it's relying on pool resources */ + err = dragon_lock_destroy(&pool->mlock); + if (err != DRAGON_SUCCESS) + append_err_return(err, "failed to release heap manager lock"); + + err = _unmap_manifest_shm(pool, attr); if (err != DRAGON_SUCCESS) append_err_return(err, "failed to unmap manifest"); @@ -486,14 +520,21 @@ _free_pool_shm(dragonMemoryPool_t * pool, dragonMemoryPoolAttr_t * attr) if (err != DRAGON_SUCCESS) append_err_return(err, "failed to unmap data"); - err = _unlink_shm_file(attr->mname); - if (err != DRAGON_SUCCESS) - append_err_return(err, "failed to unlink manifest"); - - for (int i = 0; i < attr->n_segments + 1; i++) { - err = _unlink_shm_file(attr->names[i]); + /* If another process already called destroy, then this process + should not try to unlink the files, but should unmap + the segments. The segments still remain in memory for + all other processes until they are unmapped, regardless + of whether it was destroyed. */ + if (!pool_is_destroyed) { + err = _unlink_shm_file(attr->mname); if (err != DRAGON_SUCCESS) - append_err_return(err, "failed to unlink data file"); + append_err_return(err, "failed to unlink manifest"); + + for (int i = 0; i < attr->n_segments + 1; i++) { + err = _unlink_shm_file(attr->names[i]); + if (err != DRAGON_SUCCESS) + append_err_return(err, "failed to unlink data file"); + } } /* Because this is used during error recovery, don't use no_err_return. 
@@ -635,6 +676,20 @@ _attach_heap_managers(dragonMemoryPool_t * const pool) no_err_return(DRAGON_SUCCESS); } +static dragonError_t +_detach_heap_managers(dragonMemoryPool_t * pool) +{ + dragonError_t err; + dragonMemoryPoolHeap_t * heap = &(pool->heap); + for (int idx=0; idx < heap->nmgrs; idx++) { + err = dragon_heap_detach(&(heap->mgrs[idx])); + if (err != DRAGON_SUCCESS) + append_err_return(err, "failed to destroy a heap manager"); + } + + no_err_return(DRAGON_SUCCESS); +} + static dragonError_t _destroy_heap_managers(dragonMemoryPool_t * pool) { @@ -693,9 +748,6 @@ static dragonError_t _generate_manifest_record(dragonMemory_t * mem, dragonMemoryPool_t * pool, const dragonMemoryAllocationType_t type, const dragonULInt type_id) { - if (mem->bytes == 0) - err_return(DRAGON_INVALID_ARGUMENT, "You cannot generate a manifest record with zero size. There is an internal error."); - /* generate a record and put it in the manifest */ mem->mfst_record.offset = (dragonULInt)((char *)mem->local_dptr - (char *)pool->local_dptr); mem->mfst_record.size = (dragonULInt)mem->bytes; @@ -1194,11 +1246,6 @@ dragon_memory_pool_destroy(dragonMemoryPoolDescr_t * pool_descr) if (err != DRAGON_SUCCESS) append_err_return(err, "cannot construct pool attributes from pool"); - /* Free the lock since it's relying on pool resources */ - err = dragon_lock_destroy(&pool->mlock); - if (err != DRAGON_SUCCESS) - append_err_return(err, "failed to release heap manager lock"); - err = _destroy_heap_managers(pool); if (err != DRAGON_SUCCESS) append_err_return(err, "failed to destroy the heap manager"); @@ -1468,7 +1515,7 @@ dragon_memory_pool_serialize(dragonMemoryPoolSerial_t * pool_ser, const dragonMe dragonError_t dragon_memory_pool_attach(dragonMemoryPoolDescr_t * pool_descr, const dragonMemoryPoolSerial_t * pool_ser) { - bool local_pool; + bool local_pool = true; /* Check args are not null. */ if (pool_descr == NULL) @@ -1509,12 +1556,9 @@ dragon_memory_pool_attach(dragonMemoryPoolDescr_t * pool_descr, const dragonMemo if (pool == NULL) err_return(DRAGON_INTERNAL_MALLOC_FAIL, "Could not allocate internal pool structure."); - /* If this is a non-local pool we are attaching to, then we set a flag and return. */ + /* If this is a non-local pool we are attaching to, then we set a flag. */ if (local_host_id != host_id) local_pool = false; - else - local_pool = true; - /* Grab the memory storage type */ dragonULInt mem_type = *ptr; @@ -1637,6 +1681,9 @@ dragon_memory_pool_attach_from_env(dragonMemoryPoolDescr_t * pool_descr, const c { dragonMemoryPoolSerial_t pool_ser; + if (pool_descr == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "You must provide a valid pool descriptor variable."); + char *encoded_pool_str = getenv(env_var); if (encoded_pool_str == NULL) { char err_str[200]; @@ -1650,6 +1697,38 @@ dragon_memory_pool_attach_from_env(dragonMemoryPoolDescr_t * pool_descr, const c if (err != DRAGON_SUCCESS) append_err_return(err, "failed to attach to memory pool"); + dragon_memory_pool_serial_free(&pool_ser); + + no_err_return(DRAGON_SUCCESS); +} + +/** + * @brief Attach to Default Pool + * + * Processes created by Dragon have access to a default pool, one per node. + * The default pool on a node can be used when no other managed memory pools + * are available or have been provided. This function attaches to the + * default pool on its current node. + * + * @param pool is a pointer to a pool descriptor to be initialized by this call. + * + * @return DRAGON_SUCCESS or a return code to indicate what problem occurred. 
+ */ +dragonError_t +dragon_memory_pool_attach_default(dragonMemoryPoolDescr_t* pool) +{ + dragonError_t err; + char* pool_str; + + pool_str = getenv(DRAGON_DEFAULT_PD_VAR); + + if (pool_str == NULL) + err_return(DRAGON_INVALID_OPERATION, "Called dragon_get_default_pool with no default pool set in environment."); + + err = dragon_memory_pool_attach_from_env(pool, pool_str); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not attach to default memory pool."); + no_err_return(DRAGON_SUCCESS); } @@ -1683,15 +1762,19 @@ dragon_memory_pool_detach(dragonMemoryPoolDescr_t * pool_descr) /* If this is a non-local pool, then there is less to do */ if (pool->local_dptr != NULL) { - /* Get pool attributes to free them */ dragonMemoryPoolAttr_t attrs; - _obtain_manifest_lock(pool); + _maybe_obtain_manifest_lock(pool); + err = _attrs_from_header(pool, &attrs); - _release_manifest_lock(pool); + _maybe_release_manifest_lock(pool); if (err != DRAGON_SUCCESS) append_err_return(err, "cannot construct pool attributes from pool"); + err = _detach_heap_managers(pool); + if (err != DRAGON_SUCCESS) + append_err_return(err, "Could not detach heap manager in pool detach."); + /* @MCB TODO: Need control flow to manage different memory types */ /* Unmap manifest and data pointers */ err = _unmap_manifest_shm(pool, &attrs); @@ -1851,9 +1934,6 @@ dragon_memory_serialize(dragonMemorySerial_t * mem_ser, const dragonMemoryDescr_ subsequent clones. */ *(dragonULInt*)ptr = mem->bytes; - if (mem->local_dptr != NULL && mem->mfst_record.size == 0) - err_return(DRAGON_INVALID_ARGUMENT, "Internal Failure: Found 0 sized memory allocation."); - no_err_return(DRAGON_SUCCESS); } @@ -2023,6 +2103,8 @@ dragon_memory_detach(dragonMemoryDescr_t * mem_descr) dragonError_t dragon_memory_alloc_blocking(dragonMemoryDescr_t * mem_descr, const dragonMemoryPoolDescr_t * pool_descr, const size_t bytes, const timespec_t* timeout) { + size_t alloc_bytes = bytes; + if (mem_descr == NULL) err_return(DRAGON_INVALID_ARGUMENT, "invalid memory descriptor"); @@ -2034,10 +2116,6 @@ dragon_memory_alloc_blocking(dragonMemoryDescr_t * mem_descr, const dragonMemory if (pool_descr == NULL) err_return(DRAGON_INVALID_ARGUMENT, "invalid pool descriptor"); - /* @MCB: Don't allow nonsensical "empty" allocations */ - if (bytes == 0) - err_return(DRAGON_INVALID_ARGUMENT, "cannot create allocation of size 0"); - dragonMemoryPool_t * pool; dragonError_t err = _pool_from_descr(pool_descr, &pool); if (err != DRAGON_SUCCESS) @@ -2056,7 +2134,15 @@ dragon_memory_alloc_blocking(dragonMemoryDescr_t * mem_descr, const dragonMemory err_return(DRAGON_INTERNAL_MALLOC_FAIL, "cannot allocate new memory object"); } - err = dragon_heap_malloc_blocking(&pool->heap.mgrs[0], bytes, &mem->local_dptr, timeout); + // A zero byte allocation is needed in channels when attributes are to be sent + // and potentially other entities that require a shared memory descriptor when + // there is no real allocation to make. So don't reject bytes == 0. + if (bytes == 0) + // To avoid special case code for this everywhere (cloning, freeing, etc.) + // we will make a 1 byte allocation, but say that it is zero bytes. + alloc_bytes = 1; + + err = dragon_heap_malloc_blocking(&pool->heap.mgrs[0], alloc_bytes, &mem->local_dptr, timeout); if (err != DRAGON_SUCCESS) /* Don't use append_err_return. 
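
A usage sketch for the new default-pool attach; it assumes the process was started by Dragon so that the environment variable chain described above is in place:

    /* Sketch: attach to this node's default pool. */
    dragonMemoryPoolDescr_t def_pool;
    dragonError_t derr = dragon_memory_pool_attach_default(&def_pool);
    if (derr != DRAGON_SUCCESS)
        fprintf(stderr, "no default pool: %s\n", dragon_get_rc_string(derr));
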
In hot path */ return err; @@ -2154,6 +2240,8 @@ dragonError_t dragon_memory_alloc_type_blocking(dragonMemoryDescr_t * mem_descr, const dragonMemoryPoolDescr_t * pool_descr, const size_t bytes, const dragonMemoryAllocationType_t type, const dragonULInt type_id, const timespec_t* timeout) { + size_t alloc_bytes = bytes; + if (mem_descr == NULL) err_return(DRAGON_INVALID_ARGUMENT, "invalid memory descriptor"); @@ -2189,7 +2277,15 @@ dragon_memory_alloc_type_blocking(dragonMemoryDescr_t * mem_descr, const dragonM Release the lock */ - err = dragon_heap_malloc_blocking(&pool->heap.mgrs[0], bytes, &mem->local_dptr, timeout); + // A zero byte allocation is needed in channels when attributes are to be sent + // and potentially other entities that require a shared memory descriptor when + // there is no real allocation to make. So don't reject bytes == 0. + if (bytes == 0) + // To avoid special case code for this everywhere (cloning, freeing, etc.) + // we will make a 1 byte allocation, but say that it is zero bytes. + alloc_bytes = 1; + + err = dragon_heap_malloc_blocking(&pool->heap.mgrs[0], alloc_bytes, &mem->local_dptr, timeout); if (err != DRAGON_SUCCESS) { _release_manifest_lock(pool); free(mem); @@ -2611,6 +2707,62 @@ dragon_memory_pool_get_type_allocations(const dragonMemoryPoolDescr_t * pool_des no_err_return(DRAGON_SUCCESS); } +/** + * @brief Get the base address for a memory pool + * + * This function returns the base address for a memory pool. + * + * @param pool_descr is a pointer to the descriptor to the memory pool. + * + * @param base_ptr is a pointer to the returned base address. + * + * @returns DRAGON_SUCCESS or another dragonError_t return code. +*/ + +dragonError_t +dragon_memory_pool_get_pointer(const dragonMemoryPoolDescr_t * pool_descr, void **base_ptr) +{ + dragonMemoryPool_t * pool = NULL; + dragonError_t err = _pool_from_descr(pool_descr, &pool); + if (err != DRAGON_SUCCESS) + append_err_return(err, "could not retrieve pool from descriptor"); + + if (pool->local_dptr == NULL) + err_return(DRAGON_MEMORY_OPERATION_ATTEMPT_ON_NONLOCAL_POOL, "You cannot get a base pointer for a non-local pool."); + + *base_ptr = pool->local_dptr; + + no_err_return(DRAGON_SUCCESS); +} + +/** + * @brief Get the total size of a memory pool + * + * This function returns the total size of a memory pool. + * + * @param pool_descr is a pointer to the descriptor to the memory pool. + * + * @param size is a pointer to the returned size of the pool. + * + * @returns DRAGON_SUCCESS or another dragonError_t return code. 
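
The zero-byte convention above is deliberately hidden from callers: the heap backs the request with one byte while the descriptor continues to report zero. A sketch, reusing def_pool from the previous sketch:

    /* Sketch: zero-byte allocations now succeed and report a size of 0. */
    dragonMemoryDescr_t mem;
    size_t sz = 0;

    derr = dragon_memory_alloc_blocking(&mem, &def_pool, 0, NULL /* timeout */);
    derr = dragon_memory_get_size(&mem, &sz); /* sz == 0 even though 1 byte is reserved */
    derr = dragon_memory_free(&mem);
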
+*/ + +dragonError_t +dragon_memory_pool_get_size(const dragonMemoryPoolDescr_t * pool_descr, size_t *size) +{ + dragonMemoryPool_t * pool = NULL; + dragonError_t err = _pool_from_descr(pool_descr, &pool); + if (err != DRAGON_SUCCESS) + append_err_return(err, "could not retrieve pool from descriptor"); + + if (pool->local_dptr == NULL) + err_return(DRAGON_MEMORY_OPERATION_ATTEMPT_ON_NONLOCAL_POOL, "You cannot get the size of a non-local pool."); + + *size = *pool->header.total_data_size; + + no_err_return(DRAGON_SUCCESS); +} + /** * @brief Get a pointer into a memory descriptor * @@ -2835,7 +2987,7 @@ dragon_memory_descr_clone(dragonMemoryDescr_t * newmem_descr, const dragonMemory if (err != DRAGON_SUCCESS) append_err_return(err, "invalid memory descriptor"); - if (offset >= mem->bytes) + if (offset > mem->bytes) err_return(DRAGON_INVALID_ARGUMENT, "offset too big for allocation"); if (custom_length != NULL) { diff --git a/src/lib/pals.c b/src/lib/pals.c index 65718ed..d7293ef 100644 --- a/src/lib/pals.c +++ b/src/lib/pals.c @@ -9,6 +9,9 @@ extern dragonRecvJobParams_t pmod_mparams; static void *lib_pals_handle = NULL; +static int ptrs_set = 0; +static int inside_vanilla_pals = 0; + pals_rc_t (*fn_pals_init)(pals_state_t *state); pals_rc_t (*fn_pals_init2)(pals_state_t **state); @@ -54,34 +57,62 @@ void set_pals_function_pointers() fn_pals_errmsg = dlsym(lib_pals_handle, "pals_errmsg"); } +int get_pals_context() { + return inside_vanilla_pals; +} + +void set_pals_context() { + inside_vanilla_pals = 1; +} + +void unset_pals_context() { + inside_vanilla_pals = 0; +} + int check_calling_context() { - if (getenv("_DRAGON_PALS_ENABLED")) { + if (getenv("_DRAGON_PALS_ENABLED") && !(get_pals_context())) { return 1; } else { return 0; } } + + pals_rc_t pals_init(pals_state_t *state) { + // PALS init and finalize functions will always only need to return + // the values PALS knows to be true. Thus, we need to make sure any + // PALS functions we wrap know to send back unmodified return values, + // ie: only use the results from direct calls to our PALS function pointers. set_pals_function_pointers(); - // no error checking, just pass rc through to caller - return fn_pals_init(state); + set_pals_context(); + + pals_rc_t err = fn_pals_init(state); + unset_pals_context(); + return err; } -// TODO: pals_init2 will always be defined, so how can PMI check if it's NULL? +//// TODO: pals_init2 will always be defined, so how can PMI check if it's NULL? pals_rc_t pals_init2(pals_state_t **state) { set_pals_function_pointers(); + + set_pals_context(); // no error checking, just pass rc through to caller - return fn_pals_init2(state); + pals_rc_t err = fn_pals_init2(state); + unset_pals_context(); + return err; } pals_rc_t pals_fini(pals_state_t *state) { + set_pals_context(); // no error checking, just pass rc through to caller - return fn_pals_fini(state); + pals_rc_t err = fn_pals_fini(state); + unset_pals_context(); + return err; } pals_rc_t pals_get_peidx(pals_state_t *state, int *peidx) diff --git a/src/lib/shared_lock.c b/src/lib/shared_lock.c index dafbadb..7c2b59e 100644 --- a/src/lib/shared_lock.c +++ b/src/lib/shared_lock.c @@ -10,6 +10,9 @@ #define likely(x) __builtin_expect(!!(x), 1) #define unlikely(x) __builtin_expect(!!(x), 0) +#define LOCK_DESTROYED 0xDEADDEADDEADDEAD +#define LOCK_INITD 0x0101010101010101 + /* @MCB: Lock sizes. First uint64_t size is to avoid compiler-specific enum sizes. Second uint64_t is to avoid architecture-specific size_t sizes. 
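
Together the two getters expose the local mapping of a pool. A sketch, again with def_pool assumed; both calls refuse non-local pools with DRAGON_MEMORY_OPERATION_ATTEMPT_ON_NONLOCAL_POOL:

    /* Sketch: query the base address and total data size of a local pool. */
    void* base = NULL;
    size_t pool_size = 0;

    derr = dragon_memory_pool_get_pointer(&def_pool, &base);
    derr = dragon_memory_pool_get_size(&def_pool, &pool_size);
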
@@ -350,6 +353,27 @@ dragon_lock_state(dragonLock_t * lock, dragonLockState_t * state) } } +bool +dragon_lock_is_valid(dragonLock_t * lock) +{ + if (lock == NULL) + return false; + + switch(lock->kind) { + case DRAGON_LOCK_FIFO: + return dragon_fifo_lock_is_valid(lock->ptr.fifo); + + case DRAGON_LOCK_FIFO_LITE: + return dragon_fifolite_lock_is_valid(lock->ptr.fifo_lite); + + case DRAGON_LOCK_GREEDY: + return dragon_greedy_lock_is_valid(lock->ptr.greedy); + + default: + return false; + } +} + /* map a new dragonLock_t to a block of memory and optionally initialize it */ dragonError_t dragon_fifo_lock_init(dragonFIFOLock_t * dlock, void * ptr) @@ -363,10 +387,10 @@ dragon_fifo_lock_init(dragonFIFOLock_t * dlock, void * ptr) dragonError_t derr; derr = dragon_fifo_lock_attach(dlock, ptr); - if (derr != DRAGON_SUCCESS) - append_err_return(derr,""); + if (derr != DRAGON_OBJECT_DESTROYED) + append_err_return(DRAGON_LOCK_ALREADY_INITD,""); - *(dlock->initd) = 1UL; // Make note for destroy + *(dlock->initd) = LOCK_INITD; // Make note for destroy *(dlock->lock_size) = dragonFIFOLockSize; *(dlock->now_serving) = 0UL; @@ -394,11 +418,10 @@ dragon_fifolite_lock_init(dragonFIFOLiteLock_t * dlock, void * ptr) dragonError_t derr; derr = dragon_fifolite_lock_attach(dlock, ptr); - if (derr != DRAGON_SUCCESS) - append_err_return(derr,""); - - *(dlock->initd) = 1UL; // Make note for destroy + if (derr != DRAGON_OBJECT_DESTROYED) + append_err_return(DRAGON_LOCK_ALREADY_INITD,""); + *(dlock->initd) = LOCK_INITD; // Make note for destroy *(dlock->lock_size) = dragonFIFOLiteLockSize; *(dlock->now_serving) = 0UL; *(dlock->ticket_counter) = 0UL; @@ -415,14 +438,14 @@ dragon_fifo_lock_attach(dragonFIFOLock_t * dlock, void * ptr) err_return(DRAGON_INVALID_ARGUMENT,""); ptr += sizeof(uint64_t); - dlock->lock_size = (uint64_t *)ptr; - ptr += sizeof(uint64_t); + ptr += sizeof(uint64_t); dlock->initd = (dragonLockType_t *)ptr; - ptr += sizeof(dragonLockType_t); + ptr += sizeof(dragonLockType_t); dlock->now_serving = (dragonLockType_t *)ptr; + ptr += DRAGON_LOCK_CL_PADDING; dlock->ticket_counter = (dragonLockType_t *)ptr; ptr += DRAGON_LOCK_CL_PADDING; @@ -466,6 +489,9 @@ dragon_fifo_lock_attach(dragonFIFOLock_t * dlock, void * ptr) append_err_return(derr,""); } + if (*dlock->initd != LOCK_INITD) + err_return(DRAGON_OBJECT_DESTROYED, "The Dragon object was already destroyed and cannot be attached."); + no_err_return(DRAGON_SUCCESS); } @@ -491,6 +517,9 @@ dragon_fifolite_lock_attach(dragonFIFOLiteLock_t * dlock, void * ptr) dlock->ticket_counter = (dragonLockType_t *)ptr; ptr += DRAGON_LOCK_CL_PADDING; + if (*dlock->initd != LOCK_INITD) + err_return(DRAGON_OBJECT_DESTROYED, "The Dragon object was already destroyed and cannot be attached."); + no_err_return(DRAGON_SUCCESS); } @@ -511,7 +540,7 @@ dragon_fifo_lock_detach(dragonFIFOLock_t * dlock) free(dlock->nodes_ticket_counter); dragonError_t derr = dragon_fifolite_lock_destroy(&dlock->thr_lock); - if (derr != DRAGON_SUCCESS) + if (derr != DRAGON_SUCCESS && derr != DRAGON_OBJECT_DESTROYED) append_err_return(derr,""); if (dlock->thr_lock_dptr != NULL) free(dlock->thr_lock_dptr); @@ -544,11 +573,10 @@ dragon_greedy_lock_init(dragonGreedyLock_t * dlock, void * ptr) *(uint64_t*)ptr = (uint64_t)DRAGON_LOCK_GREEDY; dragonError_t derr; derr = dragon_greedy_lock_attach(dlock, ptr); - if (derr != DRAGON_SUCCESS) - append_err_return(derr,""); - + if (derr != DRAGON_OBJECT_DESTROYED) + append_err_return(DRAGON_LOCK_ALREADY_INITD,""); - *(dlock->initd) = 1UL; // Make note for destroy + 
*(dlock->initd) = LOCK_INITD; *(dlock->lock_size) = dragonGreedyLockSize; @@ -567,19 +595,21 @@ dragon_greedy_lock_attach(dragonGreedyLock_t * dlock, void * ptr) { if (dlock == NULL) err_return(DRAGON_INVALID_ARGUMENT,""); - if (ptr == NULL) err_return(DRAGON_INVALID_ARGUMENT,""); ptr += sizeof(uint64_t); - dlock->lock_size = (uint64_t *)ptr; + ptr += sizeof(uint64_t); dlock->initd = (dragonLockType_t *)ptr; - ptr += sizeof(dragonLockType_t); + ptr += sizeof(dragonLockType_t); dlock->mutex = (pthread_mutex_t *)ptr; + if (*dlock->initd != LOCK_INITD) + err_return(DRAGON_OBJECT_DESTROYED, "The Dragon object was already destroyed and cannot be attached."); + no_err_return(DRAGON_SUCCESS); } @@ -603,15 +633,12 @@ dragon_fifo_lock_destroy(dragonFIFOLock_t * dlock) if (dlock == NULL) err_return(DRAGON_INVALID_ARGUMENT,""); - if (dlock->initd == NULL) - err_return(DRAGON_LOCK_NOT_INITD,""); - - dragonLockType_t cur_initd, already_initd; - already_initd = 0UL; - cur_initd = atomic_exchange_explicit(dlock->initd, already_initd, + dragonLockType_t cur_initd, destroy; + destroy = LOCK_DESTROYED; + cur_initd = atomic_exchange_explicit(dlock->initd, destroy, DRAGON_LOCK_MEM_ORDER); - if (cur_initd == 0UL) + if (cur_initd != LOCK_INITD && cur_initd != LOCK_DESTROYED) err_return(DRAGON_LOCK_NOT_INITD,""); return dragon_fifo_lock_detach(dlock); @@ -623,15 +650,12 @@ dragon_fifolite_lock_destroy(dragonFIFOLiteLock_t * dlock) if (dlock == NULL) err_return(DRAGON_INVALID_ARGUMENT,""); - if (dlock->initd == NULL) - err_return(DRAGON_LOCK_NOT_INITD,""); - - dragonLockType_t cur_initd, already_initd; - already_initd = 0UL; - cur_initd = atomic_exchange_explicit(dlock->initd, already_initd, + dragonLockType_t cur_initd, destroy; + destroy = LOCK_DESTROYED; + cur_initd = atomic_exchange_explicit(dlock->initd, destroy, DRAGON_LOCK_MEM_ORDER); - if (cur_initd == 0UL) + if (cur_initd != LOCK_INITD && cur_initd != LOCK_DESTROYED) err_return(DRAGON_LOCK_NOT_INITD,""); return dragon_fifolite_lock_detach(dlock); @@ -646,12 +670,12 @@ dragon_greedy_lock_destroy(dragonGreedyLock_t * dlock) if (dlock->initd == NULL) err_return(DRAGON_LOCK_NOT_INITD,""); - dragonLockType_t cur_initd, already_initd; - already_initd = 0UL; - cur_initd = atomic_exchange_explicit(dlock->initd, already_initd, + dragonLockType_t cur_initd, destroy; + destroy = LOCK_DESTROYED; + cur_initd = atomic_exchange_explicit(dlock->initd, destroy, DRAGON_LOCK_MEM_ORDER); - if (cur_initd == 0UL) + if (cur_initd != LOCK_INITD && cur_initd != LOCK_DESTROYED) err_return(DRAGON_LOCK_NOT_INITD,""); if (pthread_mutex_destroy(dlock->mutex) != 0) @@ -667,6 +691,12 @@ dragon_fifo_lock(dragonFIFOLock_t * dlock) if (dlock == NULL) err_return(DRAGON_INVALID_ARGUMENT,""); + if (dlock->initd == NULL) + err_return(DRAGON_LOCK_NOT_INITD,""); + + if (*dlock->initd != LOCK_INITD) + err_return(DRAGON_OBJECT_DESTROYED, ""); + /* get the local thread lock */ dragonError_t derr = dragon_fifolite_lock(&dlock->thr_lock); if (derr != DRAGON_SUCCESS) @@ -710,6 +740,12 @@ dragon_fifolite_lock(dragonFIFOLiteLock_t * dlock) if (dlock == NULL) err_return(DRAGON_INVALID_ARGUMENT,""); + if (dlock->initd == NULL) + err_return(DRAGON_LOCK_NOT_INITD,""); + + if (*dlock->initd != LOCK_INITD) + err_return(DRAGON_OBJECT_DESTROYED, ""); + /* now contend for the main lock */ dragonLockType_t my_ticket; my_ticket = atomic_fetch_add_explicit(dlock->ticket_counter, 1UL, @@ -735,6 +771,12 @@ dragon_greedy_lock(dragonGreedyLock_t * dlock) if (dlock == NULL) err_return(DRAGON_INVALID_ARGUMENT,""); + if 
(dlock->initd == NULL) + err_return(DRAGON_LOCK_NOT_INITD,""); + + if (*dlock->initd != LOCK_INITD) + err_return(DRAGON_OBJECT_DESTROYED, ""); + int ierr = pthread_mutex_lock(dlock->mutex); if (unlikely(ierr != 0)) err_return(DRAGON_LOCK_PTHREAD_MUTEX_LOCK,""); @@ -752,6 +794,12 @@ dragon_fifo_try_lock(dragonFIFOLock_t * dlock, int *locked) if (locked == NULL) err_return(DRAGON_INVALID_ARGUMENT,""); + if (dlock->initd == NULL) + err_return(DRAGON_LOCK_NOT_INITD,""); + + if (*dlock->initd != LOCK_INITD) + err_return(DRAGON_OBJECT_DESTROYED, ""); + *locked = 0; /* try to get the local thread lock */ @@ -823,6 +871,12 @@ dragon_fifolite_try_lock(dragonFIFOLiteLock_t * dlock, int *locked) if (locked == NULL) err_return(DRAGON_INVALID_ARGUMENT,""); + if (dlock->initd == NULL) + err_return(DRAGON_LOCK_NOT_INITD,""); + + if (*dlock->initd != LOCK_INITD) + err_return(DRAGON_OBJECT_DESTROYED, ""); + *locked = 0; dragonLockType_t cticket, now_serve; @@ -861,6 +915,12 @@ dragon_greedy_try_lock(dragonGreedyLock_t * dlock, int *locked) if (locked == NULL) err_return(DRAGON_INVALID_ARGUMENT,""); + if (dlock->initd == NULL) + err_return(DRAGON_LOCK_NOT_INITD,""); + + if (*dlock->initd != LOCK_INITD) + err_return(DRAGON_OBJECT_DESTROYED, ""); + int ierr = pthread_mutex_trylock(dlock->mutex); if (ierr == EBUSY) { @@ -961,3 +1021,39 @@ dragon_greedy_lock_state(dragonGreedyLock_t* dlock, dragonLockState_t* state) no_err_return(DRAGON_SUCCESS); } + +bool +dragon_fifolite_lock_is_valid(dragonFIFOLiteLock_t * dlock) +{ + if (dlock == NULL) + return false; + + if (dlock->initd == NULL) + return false; + + return *dlock->initd == LOCK_INITD; +} + +bool +dragon_fifo_lock_is_valid(dragonFIFOLock_t * dlock) +{ + if (dlock == NULL) + return false; + + if (dlock->initd == NULL) + return false; + + return *dlock->initd == LOCK_INITD; +} + +bool +dragon_greedy_lock_is_valid(dragonGreedyLock_t* dlock) +{ + if (dlock == NULL) + return false; + + if (dlock->initd == NULL) + return false; + + return *dlock->initd == LOCK_INITD; +} diff --git a/src/lib/shared_lock.h b/src/lib/shared_lock.h index ca5cc6b..b314c94 100644 --- a/src/lib/shared_lock.h +++ b/src/lib/shared_lock.h @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -29,7 +30,7 @@ typedef struct dragonFIFOLiteLock_st { } dragonFIFOLiteLock_t; typedef struct dragonFIFOLock_st { - dragonFIFOLiteLock_t thr_lock; // this type needs my_node (local node), which we need to protect per process + dragonFIFOLiteLock_t thr_lock; // this type needs my_node (local node), which we need to protect per process // if the same lock struct is used across threads. 
for pure FIFO each thread // should use its own lock struct void * thr_lock_dptr; @@ -98,6 +99,9 @@ dragon_unlock(dragonLock_t * lock); dragonError_t dragon_lock_state(dragonLock_t * lock, dragonLockState_t * state); +bool +dragon_lock_is_valid(dragonLock_t * lock); + /* ---------------------------------------- Begin direct API calls ---------------------------------------- */ @@ -174,6 +178,15 @@ dragon_fifo_lock_state(dragonFIFOLock_t * dlock, dragonLockState_t * state); dragonError_t dragon_greedy_lock_state(dragonGreedyLock_t * dlock, dragonLockState_t * state); +bool +dragon_fifolite_lock_is_valid(dragonFIFOLiteLock_t * dlock); + +bool +dragon_fifo_lock_is_valid(dragonFIFOLock_t * dlock); + +bool +dragon_greedy_lock_is_valid(dragonGreedyLock_t* dlock); + #ifdef __cplusplus } #endif diff --git a/src/lib/utils.c b/src/lib/utils.c index 3cec1f2..d4630b7 100644 --- a/src/lib/utils.c +++ b/src/lib/utils.c @@ -64,8 +64,8 @@ char * _errstr_with_code(char * str, int code) { char * new_str = malloc(sizeof(char) * (strnlen(str, DRAGON_MAX_ERRSTR_REC_LEN) + - snprintf(NULL, 0, " %i", code) + 1)); - sprintf(new_str, "%s %i", str, code); + snprintf(NULL, 0, " %s", dragon_get_rc_string(code)) + 1)); + sprintf(new_str, "%s %s", str, dragon_get_rc_string(code)); return new_str; } @@ -122,25 +122,13 @@ _sanitize_id(char *boot_id) int _get_dec_from_hex(char hex) { - if (isdigit(hex)) { - return atoi(&hex); - } else{ - switch(hex) { - case('a'): - return 10; - case('b'): - return 11; - case('c'): - return 12; - case('d'): - return 13; - case('e'): - return 14; - case('f'): - return 15; - } - } - return 0; + /* This only works for lowercase hex letters and digits. Don't use + for anything else! */ + + if (isdigit(hex)) + return hex - '0'; + else + return hex - 'a' + 10; } dragonError_t @@ -148,15 +136,16 @@ _hex_to_dec(char *hex, uint64_t *dec) { *dec = 0UL; int i, len = strlen(hex); - int term = len - 16; + int start = len - 16; - if (term < 0) + if (start < 0) err_return(DRAGON_INVALID_ARGUMENT, "Hex string less than 8 bytes"); // Read the last 16 digits and convert - for (i = len-1; i >= term; i--) { - *dec += (uint64_t) _get_dec_from_hex(hex[i]) * (uint64_t) pow(16.0, len-1-i); + for (i = start; i < len; i++) { + *dec = *dec * 16 + (uint64_t) _get_dec_from_hex(hex[i]); } + no_err_return(DRAGON_SUCCESS); } @@ -432,26 +421,26 @@ dragon_timespec_deadline(const timespec_t* timer, timespec_t* deadline) * Check whether the current time has past the end of a timer and compute remaining time. * * This function no_err_return(DRAGON_SUCCESS) if no timeout has occurred and computes the - * remaining time. If end_time is in the past, then this function returns DRAGON_TIMEOUT. + * remaining time. If deadline is in the past, then this function returns DRAGON_TIMEOUT. * - * @param end_time A pointer to a timespec structure that holds the time when the timer + * @param deadline A pointer to a timespec structure that holds the time when the timer * will expire. - * @param remaining_timeout The computed remaining time for the given end_time. + * @param remaining_timeout The computed remaining time for the given deadline. * @returns DRAGON_SUCCESS or DRAGON_TIMEOUT or an undetermined error code.
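The corrected loop above is plain positional accumulation, value = value * 16 + digit, applied to the last 16 lowercase hex digits. A minimal Python cross-check of that conversion (an illustration only, not part of the patch):

```
def hex_to_dec(hex_str):
    # Mirror the fixed C loop: parse the last 16 lowercase hex
    # digits into a 64-bit value.
    if len(hex_str) < 16:
        raise ValueError("Hex string less than 8 bytes")
    dec = 0
    for ch in hex_str[-16:]:
        # '0'-'9' -> 0-9, 'a'-'f' -> 10-15
        digit = ord(ch) - ord('0') if ch.isdigit() else ord(ch) - ord('a') + 10
        dec = dec * 16 + digit
    return dec

assert hex_to_dec("00000000deadbeef") == 0xdeadbeef
```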
**********************************************************************************/ dragonError_t -dragon_timespec_remaining(const timespec_t * end_time, timespec_t * remaining_timeout) +dragon_timespec_remaining(const timespec_t * deadline, timespec_t * remaining_timeout) { timespec_t now_time; - if (end_time == NULL) - err_return(DRAGON_INVALID_ARGUMENT, "Cannot pass NULL as end_time argument."); + if (deadline == NULL) + err_return(DRAGON_INVALID_ARGUMENT, "Cannot pass NULL as deadline argument."); if (remaining_timeout == NULL) err_return(DRAGON_INVALID_ARGUMENT, "Cannot pass NULL as remaining_timeout argument."); - if (end_time->tv_nsec == 0 && end_time->tv_sec == 0) { + if (deadline->tv_nsec == 0 && deadline->tv_sec == 0) { /* A zero timeout corresponds to a try-once attempt */ remaining_timeout->tv_nsec = 0; remaining_timeout->tv_sec = 0; @@ -460,13 +449,13 @@ dragon_timespec_remaining(const timespec_t * end_time, timespec_t * remaining_ti clock_gettime(CLOCK_MONOTONIC, &now_time); - if (dragon_timespec_le(end_time, &now_time)) { + if (dragon_timespec_le(deadline, &now_time)) { remaining_timeout->tv_sec = 0; remaining_timeout->tv_nsec = 0; no_err_return(DRAGON_TIMEOUT); } - dragonError_t err = dragon_timespec_diff(remaining_timeout, end_time, &now_time); + dragonError_t err = dragon_timespec_diff(remaining_timeout, deadline, &now_time); if (err != DRAGON_SUCCESS) append_err_return(err, "This shouldn't happen."); @@ -583,4 +572,4 @@ dragon_hash_ulint(dragonULInt x) z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9; z = (z ^ (z >> 27)) * 0x94d049bb133111eb; return z ^ (z >> 31); -} \ No newline at end of file +} diff --git a/src/lmod/dragon-dev.lua b/src/lmod/dragon-dev.lua index c249969..4777674 100644 --- a/src/lmod/dragon-dev.lua +++ b/src/lmod/dragon-dev.lua @@ -36,7 +36,7 @@ local DRAGON_BASE_DIR = capture(base_dir):gsub("\n$", "") -- environment modifications -- -setenv("DRAGON_VERSION", "0.61") +setenv("DRAGON_VERSION", "0.8") -- get project dir setenv("DRAGON_BASE_DIR", DRAGON_BASE_DIR) @@ -50,6 +50,8 @@ setenv("DRAGON_BASE_DIR", DRAGON_BASE_DIR) load("craype-x86-rome") load("PrgEnv-gnu"); +-- it's possible (e.g. on pinoak) to load PrgEnv-gnu but not get the gcc module +load("gcc") load("cray-python") diff --git a/src/lmod/dragon.lua b/src/lmod/dragon.lua index 5158979..9cad828 100644 --- a/src/lmod/dragon.lua +++ b/src/lmod/dragon.lua @@ -32,7 +32,7 @@ local DRAGON_BASE_DIR = capture(base_dir):gsub("\n$", "") -- environment modifications -- -setenv("DRAGON_VERSION", "0.61") +setenv("DRAGON_VERSION", "0.8") setenv("DRAGON_BASE_DIR", DRAGON_BASE_DIR) diff --git a/src/modulefiles/dragon b/src/modulefiles/dragon index 75b6dff..cc95637 100644 --- a/src/modulefiles/dragon +++ b/src/modulefiles/dragon @@ -4,7 +4,7 @@ # Module Dragon # -setenv DRAGON_VERSION 0.61 +setenv DRAGON_VERSION 0.8 setenv DRAGON_BASE_DIR [file dirname [ file dirname $ModulesCurrentModulefile ] ] setenv DRAGON_INCLUDE_DIR $env(DRAGON_BASE_DIR)/include setenv DRAGON_LIB_DIR $env(DRAGON_BASE_DIR)/lib diff --git a/src/modulefiles/dragon-dev b/src/modulefiles/dragon-dev index 8a97868..70c11aa 100644 --- a/src/modulefiles/dragon-dev +++ b/src/modulefiles/dragon-dev @@ -4,7 +4,7 @@ # Module Dragon # -setenv DRAGON_VERSION 0.61 +setenv DRAGON_VERSION 0.8 setenv DRAGON_BASE_DIR [file dirname [ file dirname $ModulesCurrentModulefile ] ] if { [ is-loaded PrgEnv-cray ] } { @@ -20,6 +20,10 @@ if { ! 
[ is-loaded PrgEnv-gnu ] && [ module avail PrgEnv-gnu ] != "" } { module load PrgEnv-gnu module load craype-x86-rome } +# it's possible (e.g. on pinoak) to load PrgEnv-gnu but not get the gcc module +if { ! [ is-loaded gcc ] && [ module avail gcc ] != "" } { + module load gcc +} if { ! [ is-loaded cray-python ] && [ module avail cray-python ] != ""} { puts stderr "+----------------------+" puts stderr "Auto-loading cray-python" diff --git a/src/pkg/CHANGELOG.md b/src/pkg/CHANGELOG.md new file mode 100644 index 0000000..57def1c --- /dev/null +++ b/src/pkg/CHANGELOG.md @@ -0,0 +1,65 @@ +# Changelog + +## [0.8] - 2024-02-26 + +### Changed + +### Added + +Libfabric support for hsta and multi-NIC support [#620, #614, #594] by nick-radcliffe was merged Mar 7, 2024, Feb 22, 2024, Feb 15, 2024 + +PyTorch Dataloader process placement patch and example [#610, #608, #601, #599] by veena-venkata-ghorakavi and wahlc was merged Feb 9, 2024, Feb 6, 2024, Jan 4, 2024 + +Streamline node updates [#607] by eric-cozzi was merged Feb 8, 2024 + +Build multiple python versions of dragon [#600] by mohammad-hadi was merged Jan 4, 2024 + +Add Overlay Network docs [#595] by eric-cozzi was merged Jan 2, 2024 + +Startup time improvement [#586] by eric-cozzi was closed on Nov 29, 2023 + +Channels file-like interface [#585] by kent-lee was merged 10 days ago + +### Removed + +Remove devcontainer code-server from the dragon-cleanup script [#606] by nicholas-hill was merged Feb 7, 2024 + +Remove compressed json wrapper [#596] by eric-cozzi was merged Jan 4, 2024 + +Remove signal handling and update Slurm srun command [#584] by eric-cozzi was merged on Nov 27, 2023 + +### Fixed + +Fix PMI issues on PBSPro+PALS systems [#617] by nicholas-hill was merged Feb 23, 2024 + +Add typecast to parameter args if env. var. 
is empty string [#615] by nicholas-hill was merged Feb 16, 2024 + +Fix queue `__del__` error from pi demo [#605] by yian-chen was merged Jan 29, 2024 + +Add checks that only the Primary Launcher Backend talks to GS [#604] by eric-cozzi was merged Jan 30, 2024 + +Fix Pool Detach/Destroy [#598] by kent-lee was merged Jan 4, 2024 + +Fix mp unittests test.support import compatibility with Python 3.10 and 3.11 [#597] by wahlc was merged Jan 4, 2024 + +Fix multinode sequence diagrams [#592] by eric-cozzi was merged on Dec 8, 2023 + +Catch executables with Path-like paths and ensure they are strings [#591] by wahlc was merged on Dec 8, 2023 + +Raise abnormal exit if head process start fails [#590] by eric-cozzi was merged on Dec 11, 2023 + +Make device_list in AcceleratorDescriptor serializable for messages [#589] by nicholas-hill was merged on Dec 4, 2023 + +Log text of SHFwdOutput messages on both backend and frontend [#588] by eric-cozzi was merged on Dec 11, 2023 + +Remove find_accelerators from `NodeDescriptor.__init__` [#587] by nicholas-hill was merged on Nov 30, 2023 + +Fix ImportError for mp unittests when tested with Python 3.10+ [#583] by wahlc was merged on Dec 8, 2023 + +Fix network-config-helper to wait for signal to exit [#582] by eric-cozzi was merged on Nov 22, 2023 + +Launcher ON should use a different port [#581] by eric-cozzi was merged on Nov 22, 2023 + +Fix attrs version constraint [#579] by eric-cozzi was merged on Nov 16, 2023 + +Gateway per msg type [#578] by nick-radcliffe was merged on Nov 21, 2023 diff --git a/src/pkg/INSTALL.md b/src/pkg/INSTALL.md index 310ceb7..120eb12 100644 --- a/src/pkg/INSTALL.md +++ b/src/pkg/INSTALL.md @@ -4,7 +4,7 @@ shared objects are separated out from the Dragon Python wheel file. This is to Dragon runtime environment from other languages, such as Fortran/C/C++. Before you can run programs using Dragon, you must set up the run-time for your -environment. You must have Python 3.9 installed and it must be in your path +environment. You must have Python 3.9, 3.10, or 3.11 installed and it must be in your path somewhere. A common choice is to use a Python virtual environment, which can be initialized from a base Python 3.9+ with: @@ -17,7 +17,7 @@ are relative to the directory that contains the README.md. The `dragon-*.whl` file must be pip3 installed once for your environment. - pip3 install --force-reinstall dragon-0.61-cp39-cp39-linux_x86_64.whl + pip3 install --force-reinstall dragon-0.8-*.whl Check and possibly update that `$PATH` is has the location of pip installed diff --git a/src/pkg/Makefile b/src/pkg/Makefile index fba6cfb..e2ab543 100644 --- a/src/pkg/Makefile +++ b/src/pkg/Makefile @@ -5,7 +5,9 @@ INSTALL_MODE ?= 755 PRODUCTS = INSTALL.md \ LICENSE.md \ README.md \ - RELEASE_NOTES.md + RELEASE_NOTES.md \ + CHANGELOG.md + build: @echo "Nothing to do for now" diff --git a/src/pkg/README.md b/src/pkg/README.md index 577035a..02ec8de 100644 --- a/src/pkg/README.md +++ b/src/pkg/README.md @@ -14,8 +14,10 @@ Before Running a Program ------------------------ Before you can run programs using Dragon, you must set up the run-time for your -environment. You must have Python 3.9 installed and it must be in your path -somewhere. +environment. You must have a supported Python installed and it must be in your path +somewhere. The Python version must correspond to the Dragon Python whl file you +are installing, e.g., a cp39 whl file corresponds to Python 3.9 and a cp310 whl file +corresponds to Python 3.10.
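The wheel tag must match the interpreter that will run Dragon. A quick way to see which CPython tag the current interpreter corresponds to (a small illustrative snippet, not part of the distribution):

```
import sys

# Prints e.g. "cp310"; install the dragon-0.8 wheel whose tag matches.
print(f"cp{sys.version_info.major}{sys.version_info.minor}")
```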
The untarred distribution file contains several subdirectories directories including the following. All provided commands are relative to the directory @@ -24,7 +26,7 @@ that contains this README.md. * The dragon-*.whl file must be pip3 installed once for your environment. - pip3 install --force-reinstall dragon-0.61-cp39-cp39-linux_x86_64.whl + pip3 install --force-reinstall dragon-0.8-*.whl * Check and possibly update that `$PATH` has the location of pip installed console scripts, such as ~/.local/bin @@ -34,7 +36,7 @@ that contains this README.md. * modulefiles - This contains module files that are needed when using Dragon. You must set up the environment by loading the dragon module as follows. - module use [/path to dragon-0.61]/modulefiles + module use [/path to dragon-0.8]/modulefiles module load dragon If you intend to use Dragon on your own Linux VM or an image that you @@ -52,8 +54,10 @@ that contains this README.md. * examples - This directory provides a few demonstration programs that provide some working examples of using multiprocessing with Dragon and of the Dragon API itself. Also under this directory are the standard Python multiprocessing unit - tests packaged for easier use with Dragon. There is a README.md in the `examples` - directory with more information about these demonstration programs. + tests packaged for easier use with Dragon (these have been developed for + Python 3.9 but are still in progress for Python 3.10 and 3.11.) There is a + README.md in the `examples` directory with more information about these + demonstration programs. * dragon_unittests - This directory contains a selection of Dragon-specific unit tests used in the development process. The tests are included as validation @@ -69,7 +73,7 @@ that contains this README.md. of Dragon. After doing the `pip3 install` and the -`module use [/path to dragon-0.61]/modulefiles && module load dragon` you have +`module use [/path to dragon-0.8]/modulefiles && module load dragon` you have completed the prerequisites for running Dragon multiprocessing programs. Running a Program using Dragon diff --git a/src/pkg/RELEASE_NOTES.md b/src/pkg/RELEASE_NOTES.md index 2333a15..1113d95 100644 --- a/src/pkg/RELEASE_NOTES.md +++ b/src/pkg/RELEASE_NOTES.md @@ -1,12 +1,25 @@ -# Dragon Open Source Release -Dragon is now open source and available for download. The open source version -comes with the full implementation of Dragon using the TCP transport agent. +# Dragon 0.8 Release Summary +This package introduces new features that enhance portability, further optimize performance at scale, and increase usability with packages that rely on Python multiprocessing derivatives. Highlighted new features are: -Optional Dragon extensions that further enhance Dragon's performance in HPC -environments exist; please see the FAQ on the Dragon documentation website. +- Ability for high speed transport agent to use multiple NICs +- Use of libfabric for high speed transport RDMA operations +- Improved performance of launcher start up time for allocations of more than ~100 nodes. +- Enhanced testing pipeline for Python 3.10 and 3.11 +- Added documentation for Overlay Network and a cookbook entry for using the PyTorch native Dataloader over a Distributed Dictionary +- Fixed PMI patching for PBS/Pals, Overlay Network port conflict and exit signaling, detach/destroy of memory pools. 
+- Fixed numpy scaling test to be able to efficiently scale to 64+ nodes -# Dragon 0.6 and 0.61 Release Summary +# Dragon 0.7 Release Summary +This package introduces a number of key features to Dragon in addition to many bug fixes, improved robustness, and +addition/refinements to documentation. Highlighted new features are: +- Ability to support running Dragon in multinode mode on an allocation of up to 1024 nodes. +- Ability to establish policies for CPU and GPU placement when starting a Dragon Process or Dragon ProcessGroup +- Enhanced support for Conda python environments +- The Dragon GlobalServices API for Dragon process Groups now supports List and Query operations +- Documentation updates explaining how to run Dragon multinode using generic SSH launch + +# Dragon 0.6 and 0.61 Release Summary This package is the first to extend Dragon beyond support for Python multiprocessing. The key new feature is support for running collections of executables, including executables that require support for PMI (e.g., MPI). PMI @@ -32,7 +45,7 @@ addition/refinements to documentation. Highlighted new features are: - mp.Value - Significantly improved launcher stability and ctrl-c handling - Preview release of a distributed dictionary -- Documentation cookbook entries for an LLM inference service, Jupyter notebooks, and distributed dictionary +- documentation cookbook entries for an llm inference service, jupyter notebooks, and distributed dictionary Note: Dragon 0.52 was released to fix a significant cosmetic bug that when triggered made it appear Dragon did not exit cleanly. It also corrects an install issue with the distributed dictionary preview component. diff --git a/src/setup.py b/src/setup.py index f1d5099..b20fcfb 100644 --- a/src/setup.py +++ b/src/setup.py @@ -17,10 +17,17 @@ ROOTDIR = str(Path(__file__).parent) +def make_relative_rpath_args(path): + """Construct platform-appropriate RPATH to support binary + wheels that ship with libdragon.so, etc.""" + return ["-Wl,-rpath,$ORIGIN/" + path] + + DragonExtension = partial(Extension, include_dirs=[f'{ROOTDIR}/lib', f'{ROOTDIR}/include'], library_dirs=[f'{ROOTDIR}/lib'], libraries=['dragon', 'rt'], + extra_link_args=make_relative_rpath_args("lib"), ) @@ -36,6 +43,7 @@ DragonExtension("dragon.dlogging.logger", ["dragon/dlogging/pydragon_logging.pyx"]), DragonExtension("dragon.pmod", ["dragon/pydragon_pmod.pyx"]), DragonExtension("dragon.perf", ["dragon/pydragon_perf.pyx"]), + DragonExtension("dragon.fli", ["dragon/pydragon_fli.pyx"]), Extension( "dragon.launcher.pmsgqueue", ["dragon/launcher/pydragon_pmsgqueue.pyx"], @@ -58,6 +66,18 @@ def initialize_options(self): self.cythonize = 0 def run(self): + rootdir = Path(ROOTDIR) + lib_tempdir = rootdir / 'dragon' / 'lib' + try: + # In order for setuptools to include files in a wheel, those + # files must be in f'{ROOTDIR}/dragon' or a subdirectory; we + # create a temporary symlink to point at f'{ROOTDIR}/lib' to + # include 'libdragon.so' etc. in the binary wheel. 
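                    # If a previous build already created this symlink,
                    # symlink_to() raises FileExistsError; the handler below
                    # re-raises only when lib_tempdir is not a symlink, so
                    # repeated in-place builds remain safe.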
+ lib_tempdir.symlink_to(rootdir / 'lib') + except: + if not lib_tempdir.is_symlink(): + raise + _cythonize = partial(cythonize, nthreads=int(os.environ.get('DRAGON_BUILD_NTHREADS', os.cpu_count())), show_all_warnings=True, @@ -94,6 +114,7 @@ def run(self): build_ext_options['inplace'] = ('setup script', 0) super().run() + lib_tempdir.unlink() class build_py(_build_py): @@ -127,7 +148,7 @@ def find_modules(self): package_dir = packages[package] except KeyError: package_dir = self.get_package_dir(package) - # We explicitly do not what to automatically add __init__.py, + # We explicitly do not want to automatically add __init__.py, # hence we pass None as the first argument, but we still want # to check the package_dir in order to properly resolve the # module_file below. @@ -170,6 +191,9 @@ def run(self): version=os.environ.get('DRAGON_VERSION', 'latest'), description="Python multiprocessing over the Dragon distributed runtime", packages=find_packages(), + package_data={ + 'dragon': ['lib/libdragon.so', 'lib/libpmod.so', 'lib/libpmsgqueue.so',] + }, ext_modules = extensions, entry_points=entry_points, python_requires=">=3.9", diff --git a/src/tools/dragon-cleanup b/src/tools/dragon-cleanup index 6f7ca9e..297cd93 100755 --- a/src/tools/dragon-cleanup +++ b/src/tools/dragon-cleanup @@ -33,7 +33,8 @@ echo $pids | xargs -r kill -9 if [ $# -eq 0 ]; then pids=`ps -U $me -ux | grep python | awk '$0 !~ /grep/' | awk '$0 !~ /bash/' | awk -F ' ' '{print $2}'` else - pids=`ps -U $me -ux | grep python | awk '$0 !~ /test_launcher/' | awk '$0 !~ /grep/' | awk '$0 !~ /bash/' | awk -F ' ' '{print $2}'` + pids=`ps -U $me -ux | grep python | awk '$0 !~ /test_launcher/' | awk '$0 !~ /grep/' | awk '$0 !~ /bash/' | \ + awk '$0 !~ /code-server/' | awk -F ' ' '{print $2}'` fi echo "Here are the Python PIDS to kill on login node": $pids echo $pids | xargs -r kill -9 diff --git a/test/ai/torch/README.md b/test/ai/torch/README.md new file mode 100644 index 0000000..c61260f --- /dev/null +++ b/test/ai/torch/README.md @@ -0,0 +1,20 @@ +# Dragon-PyTorch Test Cases + +## Reductions Test + +The following test checks that the Dragon PyTorch patch works. If the patch of init_reductions was not made, then this test would fail with an error that Dragon does not support duplicate file descriptors. If that error was not raised, then we would see that the tensor has incorrect values. This test is inspired by PyTorch's `TestMultiprocessing._test_sharing` test located in `test/test_multiprocessing.py`. + +``` +dragon test_pytorch_patches.py PyTorchPatches.test_reductions_patch -f -v +``` + +## DataLoader Placement Test + +The following test checks that the processes spawned by the Dataloader are on the same node as the spawning process. The example involves CUDA, normalizing the MNIST dataset, loading the patched Dragon dataloader with the MNIST dataset, and running the training loop over the dataset hosted in the dataloader. + +For the following test, there should be a GPU allocation with 2 nodes. PyTorch needs to be installed for the type of GPU on the nodes. To confirm the test works, ssh into the node that is printed and confirm there are 5 `python3` processes started. Alternatively, ssh into the other nodes in the allocation and confirm there are no `python3` processes started.
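The pattern the test drives is an ordinary PyTorch `DataLoader` whose workers are spawned through Dragon's start method. A minimal sketch of that pattern (illustrative only, not the test itself; synthetic tensors instead of MNIST, and no GPU required):

```
import dragon            # makes the "dragon" start method available
import dragon.ai.torch   # applies the Dragon PyTorch patches
import multiprocessing as mp

import torch
from torch.utils.data import DataLoader, TensorDataset

if __name__ == "__main__":
    mp.set_start_method("dragon")

    # With the placement patch, the worker processes land on the same
    # node as the process that creates the DataLoader.
    ds = TensorDataset(torch.arange(100.0).unsqueeze(1))
    loader = DataLoader(ds, batch_size=10, num_workers=2,
                        multiprocessing_context="dragon",
                        persistent_workers=True)
    for (batch,) in loader:
        pass  # a training step would go here
```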
+ +We run as follows: +``` +dragon test_pytorch_patches.py PyTorchPatches.test_placement -f -v +``` \ No newline at end of file diff --git a/test/ai/torch/test_pytorch_patches.py b/test/ai/torch/test_pytorch_patches.py new file mode 100644 index 0000000..e54534d --- /dev/null +++ b/test/ai/torch/test_pytorch_patches.py @@ -0,0 +1,90 @@ +from __future__ import print_function +import unittest +import dragon +import multiprocessing as mp +import dragon.ai.torch +import torch +from torchvision import datasets, transforms +import os +import socket +import inspect +from dragon.ai.torch.monkeypatching import dragon_fp_register +from multiprocessing.reduction import ForkingPickler + + +def simple_fill(queue, event): + data = queue.get() + data[0][:] = 4 + event.set() + +class PyTorchPatches(unittest.TestCase): + """The test is designed to run on GPU systems.""" + + def test_placement(self): + mp.set_start_method("dragon") + host_name = socket.gethostname() + print(f"Dataloader python processes should be on {host_name}", flush=True) + # Training settings + device = torch.device("cuda", 0) + train_kwargs = {"batch_size": 32} + cuda_kwargs = { + "num_workers": 4, + "pin_memory": True, + "shuffle": True, + "multiprocessing_context": "dragon", + "persistent_workers": True, + } + train_kwargs.update(cuda_kwargs) + + # normalize and scale the MNIST dataset + transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]) + + dataset1 = datasets.MNIST("./data", train=True, download=True, transform=transform) + + # create the dataloader for the MNIST dataset + train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs) + + # training loop + for batch_idx, (data, target) in enumerate(train_loader): + data, target = data.to(device), target.to(device) + self.assertIsInstance(data, torch.Tensor) + self.assertIsInstance(target, torch.Tensor) + + def test_reductions_patch(self): + mp.set_start_method("dragon") + os_patch = False + if "DRAGON_PATCH_TORCH" in os.environ: + os_patch = True + # check that the OS patch is Dragon Patch Torch + self.assertTrue(os_patch) + # check that the reductions and the file with reductions exist + self.assertTrue(ForkingPickler.register != classmethod(dragon_fp_register)) + self.assertTrue(torch.multiprocessing.reductions.init_reductions) + self.assertTrue(inspect.getfile(torch.multiprocessing.reductions.init_reductions)) + + x = torch.zeros(5, 5).to('cpu', torch.float) + q = mp.Queue() + e = mp.Event() + + data = [x, x[:, 1]] + q.put(data) + + p = mp.Process(target=simple_fill, args=(q, e)) + p.daemon = True + p.start() + + self.assertTrue(e.wait(10)) + #this is the opposite behavior of what torch tests + #torch_multiprocessing expects these to be in share + #memory and thus that the value is changed by the + #spawned process to a value of 4. 
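    #Dragon moves the tensors through the queue by value rather than
    #sharing them via duplicated file descriptors, so the child's
    #writes stay in the child and the parent still sees zeros below.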
+ self.assertTrue(data[0].eq(0).all()) + self.assertTrue(data[1].eq(0).all()) + + p.join(100) + self.assertFalse(p.is_alive()) + + + +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/test/channels_subtests/.gitignore b/test/channels_subtests/.gitignore index dcfe819..9a1e062 100644 --- a/test/channels_subtests/.gitignore +++ b/test/channels_subtests/.gitignore @@ -8,3 +8,4 @@ test_wrong fch_test test_channelsets test_gateways +test_fli diff --git a/test/channels_subtests/Makefile b/test/channels_subtests/Makefile index 82a28bf..4015982 100644 --- a/test/channels_subtests/Makefile +++ b/test/channels_subtests/Makefile @@ -4,14 +4,14 @@ CFLAGS ?= -fPIC -Wall -Ofast -fomit-frame-pointer INCLUDE = -I $(DRAGON_INCLUDE_DIR) LIBS = -L $(DRAGON_LIB_DIR) -BIN_FILES = test_ch test_bch perf_fch test_send test_poll test_peek_pop test_channelsets test_wrong test_gateways test_gateway_messages +BIN_FILES = test_ch test_bch perf_fch test_send test_poll test_peek_pop test_channelsets test_wrong test_gateways test_gateway_messages test_fli %.c.o: %.c $(CC) $(INCLUDE) $(CFLAGS) -c $< -o $@ default: build -build: test_ch test_bch perf_fch test_send test_poll test_channelsets test_wrong test_gateways test_gateway_messages +build: test_ch test_bch perf_fch test_send test_poll test_channelsets test_wrong test_gateways test_gateway_messages test_fli test_ch: test_ch.c.o $(CC) $(INCLUDE) $(CFLAGS) -o test_ch $< $(LIBS) -ldragon -ldl @@ -34,6 +34,9 @@ perf_fch: perf_fch.c.o test_channelsets: test_channelsets.c.o $(CC) $(INCLUDE) $(CFLAGS) -o test_channelsets $< $(LIBS) -ldragon -ldl +test_fli: test_fli.c.o + $(CC) $(INCLUDE) $(CFLAGS) -o test_fli $< $(LIBS) -ldragon -ldl + test_gateways: test_gateways.c.o $(CC) $(INCLUDE) $(CFLAGS) -o test_gateways $< $(LIBS) -ldragon -ldl diff --git a/test/channels_subtests/test_basic_channels.py b/test/channels_subtests/test_basic_channels.py index 17376e4..ddd0e2d 100644 --- a/test/channels_subtests/test_basic_channels.py +++ b/test/channels_subtests/test_basic_channels.py @@ -4,6 +4,8 @@ import unittest import os import multiprocessing as mp +import dragon.infrastructure.parameters as dparms +from dragon.utils import B64 from dragon.channels import Channel, Message, ChannelSendH, ChannelRecvH, ChannelError, ChannelSendError, \ ChannelRecvError, OwnershipOnSend, LockType, FlowControl, ChannelFull, ChannelEmpty, ChannelHandleNotOpenError, \ ChannelRecvTimeout, ChannelSendTimeout, EventType, ChannelBarrierBroken, ChannelBarrierReady, MASQUERADE_AS_REMOTE, \ @@ -15,11 +17,11 @@ MAX_SPINNERS = 5 def worker_attach_detach(ch_ser, pool_ser): - ch = Channel.attach(ch_ser) + mpool = MemoryPool.attach(pool_ser) + ch = Channel.attach(ch_ser, mpool) sendh = ch.sendh() sendh.open() - mpool = MemoryPool.attach(pool_ser) msg = Message.create_alloc(mpool, 512) mview = msg.bytes_memview() mview[0:5] = b"Hello" @@ -46,10 +48,9 @@ def worker_pickled_attach(ch, mpool): def worker_send_recv(id, ch_ser, ch2_ser, pool_ser): try: - ch = Channel.attach(ch_ser) - ch2 = Channel.attach(ch2_ser) mpool = MemoryPool.attach(pool_ser) - + ch = Channel.attach(ch_ser, mpool) + ch2 = Channel.attach(ch2_ser, mpool) sendh = ch.sendh(wait_mode=SPIN_WAIT) sendh.open() @@ -68,17 +69,16 @@ def worker_send_recv(id, ch_ser, ch2_ser, pool_ser): sys.exit(0) except Exception as ex: - print(repr(ex)) + print(ex) sys.exit(1) def worker_fill_poll_empty(ch_ser, pool_ser): try: - ch = Channel.attach(ch_ser) + mpool = MemoryPool.attach(pool_ser) + ch = Channel.attach(ch_ser, mpool) sendh = 
ch.sendh() sendh.open() - mpool = MemoryPool.attach(pool_ser) - for j in range(3): for i in range(BARRIER_CHANNEL_CAPACITY): msg = Message.create_alloc(mpool, 512) @@ -92,7 +92,7 @@ def worker_fill_poll_empty(ch_ser, pool_ser): except Exception as ex: - print(repr(ex)) + print(ex) sys.exit(1) def worker_empty_poll_full(ch_ser): @@ -114,7 +114,7 @@ def worker_empty_poll_full(ch_ser): sys.exit(1) except Exception as ex: - print(repr(ex)) + print(ex) sys.exit(1) @@ -148,8 +148,8 @@ def worker_barrier_wait(ch_ser): sys.exit(2) except Exception as ex: - #print("There was an exception while running the barrier wait process.", flush=True) - #print(repr(ex), flush=True) + print("There was an exception while running the barrier wait process.", flush=True) + print(ex, flush=True) sys.exit(1) @@ -460,11 +460,18 @@ class ChannelTests(unittest.TestCase): @classmethod def setUpClass(cls): - pool_name = f"pydragon_channel_test_{os.getpid}" + pool_name = f"pydragon_channel_test_{os.getpid()}" pool_size = 1073741824 # 1GB pool_uid = 1 cls.mpool = MemoryPool(pool_size, pool_name, pool_uid) + # This is wrong below. There appears to be a problem where the check for whether a pool + # is local in a channel is not returning true when it should. This may be based on hostid. + # # This must be done because the channels attach expects to find + # # an environment where it can find a default pool to attach to. + # pool_ser = B64.bytes_to_str(cls.mpool.serialize()) + # os.environ[dparms.this_process.default_pd] = pool_ser + @classmethod def tearDownClass(cls): cls.mpool.destroy() @@ -564,7 +571,8 @@ def test_poll_barrier2(self): for i in range(BARRIER_CHANNEL_CAPACITY*2): proc_list[i].join() self.assertEqual(proc_list[i].exitcode, 0, 'Non-zero exitcode from barrier proc') - except: + except Exception as ex: + print(ex) print('**** Error in test_poll_barrier2') print(f'*** Receivers={self.ch2.blocked_receivers}') print(f'*** Number of messages={self.ch2.num_msgs}') @@ -613,7 +621,7 @@ def test_poll_barrier3_with_abort(self): self.ch2.poll(event_mask=EventType.POLLRESET) except Exception as ex: - print(repr(ex)) + print(ex) print('**** Error in test_poll_barrier3_with_abort') print(f'*** Receivers={self.ch2.blocked_receivers}') print(f'*** Number of messages={self.ch2.num_msgs}') @@ -1040,8 +1048,9 @@ def test_create_destroy(self): msg.bytes_memview() def test_zero_size(self): - with self.assertRaises(ChannelError): - Message.create_alloc(self.mpool, 0) + msg = Message.create_alloc(self.mpool, 0) + self.assertEqual(len(msg.tobytes()), 0) + self.assertEqual(len(msg.bytes_memview()), 0) def test_negative_size(self): with self.assertRaises(OverflowError): @@ -1062,11 +1071,11 @@ def worker_send_to_chset(ch_ser, pool_ser): # polling on the channel set try: - ch = Channel.attach(ch_ser) + mpool = MemoryPool.attach(pool_ser) + ch = Channel.attach(ch_ser, mpool) sendh = ch.sendh() sendh.open() - mpool = MemoryPool.attach(pool_ser) msg = Message.create_alloc(mpool, 32) mview = msg.bytes_memview() mview[0:5] = b"Hello" @@ -1076,14 +1085,14 @@ def worker_send_to_chset(ch_ser, pool_ser): sendh.close() except Exception as ex: - print(repr(ex)) + print(ex) class ChannelSetTests(unittest.TestCase): @classmethod def setUpClass(cls): - pool_name = f"pydragon_channelset_test_{os.getpid}" + pool_name = f"pydragon_channelset_test_{os.getpid()}" pool_size = 1073741824 # 1GB pool_uid = 1 cls.mpool = MemoryPool(pool_size, pool_name, pool_uid) diff --git a/test/channels_subtests/test_fli.c b/test/channels_subtests/test_fli.c new file mode 
100644 index 0000000..2866c98 --- /dev/null +++ b/test/channels_subtests/test_fli.c @@ -0,0 +1,708 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../_ctest_utils.h" + +#define M_UID 0 +#define POOL_M_UID 2 +#define POOL "fli_test" +#define NUM_CHANNELS 10 +#define MAX_STREAM_SIZE 500 + +const static char* text = "This is some text to compare to so we know that this thing works!"; + +void +check_result(dragonError_t err, dragonError_t expected_err, int* tests_passed, int* tests_attempted) +{ + (*tests_attempted)++; + + if (err != expected_err) { + printf("Test %d Failed with error code %s\n", *tests_attempted, dragon_get_rc_string(err)); + printf("%s\n", dragon_getlasterrstr()); + exit(0); + } + else + (*tests_passed)++; +} + +dragonError_t create_pool(dragonMemoryPoolDescr_t* mpool) { + /* Create a memory pool to allocate messages and a Channel out of */ + size_t mem_size = 1UL<<31; + + dragonError_t err = dragon_memory_pool_create(mpool, mem_size, POOL, POOL_M_UID, NULL); + if (err != DRAGON_SUCCESS) + err_fail(err, "Failed to create memory pool"); + + return DRAGON_SUCCESS; +} + +dragonError_t create_channels(dragonMemoryPoolDescr_t* mpool, dragonChannelDescr_t channel[], int arr_size) { + int k; + dragonError_t err; + + for (k=0;k 5) { + snprintf(err_str, 199, "%lu bytes were received while only 5 were requested in proc_receiver.", num_bytes); + err_fail(DRAGON_FAILURE, err_str); + } + strncat(str, buff, num_bytes); + } + } + + if (err != DRAGON_EOT) + err_fail(err, "There was an error reading the string"); + + if (err != DRAGON_EOT) + err_fail(err, "There was an error reading the string"); + + if (strcmp(text, str)) + err_fail(DRAGON_INVALID_ARGUMENT, "There was an error with the received string."); + + err = dragon_fli_close_recv_handle(&recvh, NULL); + if (err != DRAGON_SUCCESS) + err_fail(err, "There was an error closing the receive handle"); + + return DRAGON_SUCCESS; +} + +int proc_receiver(dragonFLISerial_t* serial, dragonMemoryPoolDescr_t* pool, dragonChannelDescr_t* strm_ch, size_t requested_size) { + dragonError_t err; + dragonFLIDescr_t fli; + dragonFLIRecvHandleDescr_t recvh; + size_t num_bytes = 0; + uint64_t arg; + uint8_t* bytes; + char str[MAX_STREAM_SIZE]; + char err_str[200]; + str[0] = '\0'; + + + err = dragon_fli_attach(serial, pool, &fli); + if (err != DRAGON_SUCCESS) + err_fail(err, "Failed to attach to adapter"); + + err = dragon_fli_open_recv_handle(&fli, &recvh, strm_ch, NULL); + if (err != DRAGON_SUCCESS) + err_fail(err, "Failed to open receive handle"); + + err = DRAGON_SUCCESS; + while (err == DRAGON_SUCCESS) { + err = dragon_fli_recv_bytes(&recvh, requested_size, &num_bytes, &bytes, &arg, NULL); + if (err == DRAGON_SUCCESS) { + if (requested_size > 0 && num_bytes > requested_size) { + snprintf(err_str, 199, "%lu bytes were received while only %lu were requested in proc_receiver.", num_bytes, requested_size); + err_fail(DRAGON_FAILURE, err_str); + } + strncat(str, (char*)bytes, num_bytes); + free(bytes); + } + } + + if (strcmp(text, str)) + err_fail(DRAGON_INVALID_ARGUMENT, "There was an error with the received string."); + + err = dragon_fli_close_recv_handle(&recvh, NULL); + if (err != DRAGON_SUCCESS) + err_fail(err, "There was an error closing the receive handle"); + + return DRAGON_SUCCESS; +} + +int proc_receiver_inc_chunk(dragonFLISerial_t* serial, dragonMemoryPoolDescr_t* pool, dragonChannelDescr_t* strm_ch, size_t requested_size) { + dragonError_t err; + 
dragonFLIDescr_t fli; + dragonFLIRecvHandleDescr_t recvh; + size_t num_bytes = 0; + uint64_t arg; + uint8_t* bytes; + char str[MAX_STREAM_SIZE]; + char err_str[200]; + str[0] = '\0'; + + err = dragon_fli_attach(serial, pool, &fli); + if (err != DRAGON_SUCCESS) + err_fail(err, "Failed to attach to adapter"); + + err = dragon_fli_open_recv_handle(&fli, &recvh, strm_ch, NULL); + if (err != DRAGON_SUCCESS) + err_fail(err, "Failed to open receive handle"); + + err = DRAGON_SUCCESS; + while (err == DRAGON_SUCCESS) { + err = dragon_fli_recv_bytes(&recvh, requested_size, &num_bytes, &bytes, &arg, NULL); + if (err == DRAGON_SUCCESS) { + if (requested_size > 0 && num_bytes > requested_size) { + snprintf(err_str, 199, "%lu bytes were received while only %lu were requested in proc_receiver_inc_chunk.", num_bytes, requested_size); + err_fail(DRAGON_FAILURE, err_str); + } + strncat(str, (char*)bytes, num_bytes); + free(bytes); + } + requested_size+=1; + } + + if (err != DRAGON_EOT) + err_fail(err, "There was an error reading the string"); + + if (strcmp(text, str)) + err_fail(DRAGON_INVALID_ARGUMENT, "There was an error with the received string."); + + err = dragon_fli_close_recv_handle(&recvh, NULL); + if (err != DRAGON_SUCCESS) + err_fail(err, "There was an error closing the receive handle"); + + return DRAGON_SUCCESS; +} + +int proc_receiver_fd(dragonFLISerial_t* serial, dragonMemoryPoolDescr_t* pool) { + dragonError_t err; + dragonFLIDescr_t fli; + dragonFLIRecvHandleDescr_t recvh; + size_t num_bytes = 0; + size_t read_bytes = 1; + char str[MAX_STREAM_SIZE]; + int fd; + + err = dragon_fli_attach(serial, pool, &fli); + if (err != DRAGON_SUCCESS) + err_fail(err, "Failed to attach to adapter"); + + err = dragon_fli_open_recv_handle(&fli, &recvh, NULL, NULL); + if (err != DRAGON_SUCCESS) + err_fail(err, "Failed to open receive handle"); + + err = dragon_fli_create_readable_fd(&recvh, &fd, NULL); + if (err != DRAGON_SUCCESS) + err_fail(err, "Failed to open receive fd handle"); + + err = DRAGON_SUCCESS; + + while (read_bytes > 0) { + read_bytes = read(fd, &str[num_bytes], MAX_STREAM_SIZE - num_bytes); + num_bytes += read_bytes; + } + + if (strcmp(text, str)) + err_fail(DRAGON_INVALID_ARGUMENT, "There was an error with the received string from the file descriptor."); + + close(fd); + + err = dragon_fli_finalize_readable_fd(&recvh); + if (err != DRAGON_SUCCESS) + err_fail(err, "There was an error finalizing the readable fd"); + + err = dragon_fli_close_recv_handle(&recvh, NULL); + if (err != DRAGON_SUCCESS) + err_fail(err, "There was an error closing the receive handle"); + + return DRAGON_SUCCESS; +} + +int main() { + dragonError_t err; + int tests_passed = 0; + int tests_attempted = 0; + + dragonMemoryPoolDescr_t pool; + dragonChannelDescr_t channels[NUM_CHANNELS]; + dragonChannelDescr_t* channel_ptrs[NUM_CHANNELS]; + dragonFLISerial_t ser; + dragonFLIDescr_t fli; + dragonFLISendHandleDescr_t sendh; + dragonFLIRecvHandleDescr_t recvh; + size_t num_bytes; + uint8_t* bytes = NULL; + uint64_t arg; + int status; + size_t chunk_size = 1; + int fd; + + /* Creating the channel set and the pool */ + for (int k=0;k 0: + x = r.read() + s += x + + r.close() + + if s != expected: + print(f'The expected string as {expected} and received {s} instead!') + return -1 + + recvh.finalize_fd() + + recvh.close() + + return 0 + except Exception as ex: + print(f"GOT EXCEPTION: {ex}") + +def echo(fli_in, fli_out): + sendh = fli_out.sendh() + recvh = fli_in.recvh() + + (x, hint) = recvh.recv_bytes() # recv_bytes returns a tuple, 
first the bytes then the message attribute + try: + _ = recvh.recv_bytes() + print('Did not get EOT as expected', flush=True) + except EOFError: + pass + recvh.close() + sendh.send_bytes(x, hint) + sendh.close() + + +class FLISendRecvTest(unittest.TestCase): + + @classmethod + def setUpClass(cls): + pass + + @classmethod + def tearDownClass(cls): + pass + + def setUp(self): + pool_name = f"pydragon_fli_test_{os.getpid()}" + pool_size = 1073741824 # 1GB + pool_uid = 1 + self.mpool = MemoryPool(pool_size, pool_name, pool_uid) + self.main_ch = Channel(self.mpool, 1) + self.manager_ch = Channel(self.mpool, 2) + self.stream_chs = [] + for i in range(5): + self.stream_chs.append(Channel(self.mpool, 3+i)) + + self.fli = FLInterface(main_ch=self.main_ch, manager_ch=self.manager_ch, pool=self.mpool, stream_channels=self.stream_chs) + + def tearDown(self): + self.fli.destroy() + for i in range(5): + self.stream_chs[i].destroy() + self.mpool.destroy() + + def test_create_close_send_handle(self): + sendh = self.fli.sendh() + sendh.close() + + @unittest.skip("Hangs indefinitely on close") + def test_create_close_recv_handle(self): + recvh = self.fli.recvh() + recvh.close() + + def test_send_recv_bytes(self): + sendh = self.fli.sendh() + recvh = self.fli.recvh() + + b = b'Hello World' + sendh.send_bytes(b) + sendh.close() + (x, _) = recvh.recv_bytes() # recv_bytes returns a tuple, first the bytes then the message attribute + self.assertEqual(b, x) + + with self.assertRaises(FLIEOT): + (x, _) = recvh.recv_bytes() # We should get back an EOT here + recvh.close() + + def test_send_recv_mem(self): + sendh = self.fli.sendh() + recvh = self.fli.recvh() + + mem = self.mpool.alloc(512) + mview = mem.get_memview() + mview[0:5] = b'Hello' + + sendh.send_mem(mem) + sendh.close() + (recv_mem, _) = recvh.recv_mem() + + mview2 = recv_mem.get_memview() + self.assertEqual(b'Hello', mview2[0:5]) + + with self.assertRaises(FLIEOT): + _ = recvh.recv_mem() + recvh.close() + + def test_send_recv_bytes_buffer(self): + pass + + def test_send_bytes_recv_mem(self): + sendh = self.fli.sendh() + recvh = self.fli.recvh() + + b = b'Hello' + sendh.send_bytes(b) + sendh.close() + (x, _) = recvh.recv_mem() + mview = x.get_memview() + self.assertEqual(b'Hello', bytes(mview[0:5])) + + with self.assertRaises(FLIEOT): + _ = recvh.recv_mem() + recvh.close() + + def test_send_recv_direct(self): + stream = Channel(self.mpool, 9999) + sendh = self.fli.sendh(stream) + recvh = self.fli.recvh() + + b = b'Hello World' + sendh.send_bytes(b) + sendh.close() + + (x, _) = recvh.recv_bytes() + self.assertEqual(b'Hello World', x) + + with self.assertRaises(FLIEOT): + _ = recvh.recv_bytes() + recvh.close() + + stream.destroy() + + def test_create_close_write_file(self): + sendh = self.fli.sendh() + fdes = sendh.create_fd() + + f = os.fdopen(fdes, 'w') + f.write("Test") + f.close() + sendh.finalize_fd() + sendh.close() + + def test_read_write_file(self): + fli_ser = self.fli.serialize() + test_string = 'Hello World' + p = mp.Process(target=worker_recv_fd, args=(fli_ser, self.mpool, test_string)) + p.start() + sendh = self.fli.sendh() + fdes = sendh.create_fd() + f = os.fdopen(fdes, 'w') + f.write(test_string) + f.close() + sendh.finalize_fd() + sendh.close() + p.join() + + def test_pass_fli(self): + main2_ch = Channel(self.mpool, 101) + manager2_ch = Channel(self.mpool, 102) + stream2_chs = [] + for i in range(5): + stream2_chs.append(Channel(self.mpool, 103+i)) + + fli2 = FLInterface(main_ch=main2_ch, manager_ch=manager2_ch, pool=self.mpool, 
stream_channels=stream2_chs) + + proc = mp.Process(target=echo, args=(self.fli, fli2)) + proc.start() + sendh = self.fli.sendh() + recvh = fli2.recvh() + b = b'Hello World' + sendh.send_bytes(b, 42) + sendh.close() + x, hint = recvh.recv_bytes() + self.assertEqual(x, b) + self.assertEqual(42, hint) + proc.join() + + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/test/channels_subtests/test_peek_pop.c b/test/channels_subtests/test_peek_pop.c index f666ffc..d8920df 100644 --- a/test/channels_subtests/test_peek_pop.c +++ b/test/channels_subtests/test_peek_pop.c @@ -287,7 +287,6 @@ int main(int argc, char *argv[]) } - //dbg comment explaining this err = dragon_channel_message_destroy(&peek_msg, 1); if (err != DRAGON_SUCCESS) err_fail(err, "Failed to destroy peek message"); @@ -296,7 +295,6 @@ int main(int argc, char *argv[]) } - //dbg comment explaining this err = dragon_channel_message_destroy(&send_msg, 1); if (err != DRAGON_SUCCESS) err_fail(err, "Failed to destroy peek message"); @@ -383,7 +381,6 @@ int main(int argc, char *argv[]) } - //dbg comment explaining this err = dragon_channel_message_destroy(&peek_msg, 1); if (err != DRAGON_SUCCESS) { char * errstr = dragon_getlasterrstr(); diff --git a/test/infrastructure/test_gpu_desc.py b/test/infrastructure/test_gpu_desc.py new file mode 100644 index 0000000..71d5091 --- /dev/null +++ b/test/infrastructure/test_gpu_desc.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python3 + +import unittest + +import dragon.globalservices.policy_eval as dgpol +from dragon.infrastructure.gpu_desc import AcceleratorDescriptor, AccVendor, AccEnvStr +from dragon.infrastructure.policy import Policy, GS_DEFAULT_POLICY +from dragon.infrastructure.node_desc import NodeDescriptor + +class TestGpuDesc(unittest.TestCase): + + def setUp(self): + self.nvidia = AcceleratorDescriptor(vendor = AccVendor.NVIDIA, + device_list = [0,1,2], + env_str = AccEnvStr.NVIDIA) + self.amd = AcceleratorDescriptor(vendor = AccVendor.AMD, + device_list = [0,1,2], + env_str = AccEnvStr.AMD) + self.node = NodeDescriptor() + self.node.host_name = "mock_node" + self.node.host_id = 1 + self.node.h_uid = 1 + self.node.num_cpus = 8 + + def test_nvidia(self): + self.node.accelerators = self.nvidia + p = Policy(gpu_affinity=[1]) # Make sure we get the proper device other than 0 + eval = dgpol.PolicyEvaluator([self.node], GS_DEFAULT_POLICY) + layout = eval.evaluate([p]) + self.assertEqual(layout.accelerator_env, self.nvidia.env_str) + self.assertEqual(layout.gpu_core, [1]) + + def test_amd(self): + self.node.accelerators = self.amd + p = Policy(gpu_affinity=[1]) # Make sure we get the proper device other than 0 + eval = dgpol.PolicyEvaluator([self.node], GS_DEFAULT_POLICY) + layout = eval.evaluate([p]) + self.assertEqual(layout.accelerator_env, self.amd.env_str) + self.assertEqual(layout.gpu_core, [1]) + \ No newline at end of file diff --git a/test/infrastructure/test_policy.py b/test/infrastructure/test_policy.py new file mode 100644 index 0000000..2acc5b3 --- /dev/null +++ b/test/infrastructure/test_policy.py @@ -0,0 +1,35 @@ +import unittest +from dataclasses import asdict, dataclass, field +from dragon.infrastructure.policy import Policy +import threading + + +class PolicyTester(unittest.TestCase): + def testit(self): + with Policy(distribution=Policy.Distribution.BLOCK): + # In infrastructure.process, channel, and pool you can use the code that I provided from + # infrastructure.process to pass the top of the stack in the messages sent to global services. 
+ # For instance, you would access the top like I have below if the policy is not None that was + # passed in (see process.py in the branch I pushed). Don't bother changing api_setup.py, the + # better spot to have the policy stack is in policy.py. + self.assertIsInstance(Policy.global_policy(), Policy) + + self.assertIsInstance(Policy.global_policy(), Policy) + + def test_policy(self): + with Policy(distribution=Policy.Distribution.DEFAULT): + # In infrastructure.process, channel, and pool you can use the code that I provided from + # infrastructure.process to pass the top of the stack in the messages sent to global services. + # For instance, you would access the top like I have below if the policy is not None that was + # passed in (see process.py in the branch I pushed). Don't bother changing api_setup.py, the + # better spot to have the policy stack is in policy.py. + self.assertIsInstance(Policy.global_policy(), Policy) + t = threading.Thread(target=self.testit, args=()) + t.start() + t.join() + self.assertIsInstance(Policy.global_policy(), Policy) + + self.assertIsInstance(Policy.global_policy(), Policy) +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/test/launcher/backend_testing_mocks.py b/test/launcher/backend_testing_mocks.py index ba8ebbd..1adfe4e 100644 --- a/test/launcher/backend_testing_mocks.py +++ b/test/launcher/backend_testing_mocks.py @@ -9,6 +9,7 @@ from dragon.infrastructure import facts as dfacts from dragon.infrastructure import messages as dmsg from dragon.infrastructure import process_desc +from dragon.infrastructure.node_desc import NodeDescriptor from dragon.infrastructure.connection import Connection, ConnectionOptions from dragon.infrastructure.parameters import this_process, POLICY_INFRASTRUCTURE from dragon.infrastructure.messages import AbnormalTerminationError @@ -71,12 +72,14 @@ class LauncherBackendHelper: def __init__(self, - network_config, + network_config_file, node_index, ip_addrs, fe_host_id, be_host_id): - self.network_config = network_config + self.network_config_file = network_config_file + self.network_config = \ + NetworkConfig.from_file(self.network_config_file).get_network_config() self.node_index = node_index self.ip_addrs = ip_addrs self.fe_host_id = fe_host_id @@ -132,15 +135,16 @@ def handle_overlay_start(self, mock_overlay, mock_start_localservices): self.launcher_fe.connect_to_be() self.launcher_fe.send_FENodeIdxBE() # M9 + self.be_overlay_tsta.recv_TAUpdateNodes() + # Get generated LS stdin/stdout file descriptors. 
Turn them into newline stream wrappers _, self.ls_stdin_queue, self.ls_stdout_queue = mock_start_localservices def handle_network_and_frontend_start(self, mock_network_config): '''Define the mocked up network and frontend''' - net = NetworkConfig.from_file(self.network_config) - mock_network_config.return_value = net.get_network_config()[str(self.node_index)] + mock_network_config.return_value = self.network_config[str(self.node_index)] - self.launcher_fe = LauncherFrontEnd(self.node_index, self.fe_host_id) + self.launcher_fe = LauncherFrontEnd(self.node_index, self.fe_host_id, self.network_config) args_map = get_args_map( self.network_config, @@ -156,7 +160,8 @@ def start_ls(self, mock_start_localservices, garble_shchannelsup=False, garble_lachannelsinfo=False, - abort_lachannelsinfo=False): + abort_lachannelsinfo=False, + accelerator_present=False): self.handle_overlay_start(mock_overlay, mock_start_localservices) @@ -172,6 +177,8 @@ def start_ls(self, if garble_shchannelsup: self.localservices.send_SHChannelsUP(custom_msg=json.dumps(self.garbage_dict)) return + elif accelerator_present: + self.localservices.send_SHChannelsUP(accelerator_present=accelerator_present) # M13 else: self.localservices.send_SHChannelsUP() # M13 @@ -192,10 +199,15 @@ def start_ls(self, self.localservices.recv_LAChannelsInfo() # M16 - def clean_startup(self, log, mock_overlay, mock_network_config, mock_start_localservices): + def clean_startup(self, + log, + mock_overlay, + mock_network_config, + mock_start_localservices, + accelerator_present=False): '''Execute a clean bringup and exit of all backend services''' - self.start_ls(mock_overlay, mock_start_localservices) + self.start_ls(mock_overlay, mock_start_localservices, accelerator_present=accelerator_present) # A11 Local Services spawns TA @@ -357,10 +369,11 @@ def cleanup_ls(self): class LauncherFrontEnd: - def __init__(self, node_index, host_id): + def __init__(self, node_index, host_id, network_config): self.log = logging.getLogger("_launcher_fe") self.node_index = node_index self.host_id = host_id + self.network_config = network_config self.port = dfacts.DEFAULT_TRANSPORT_PORT self.num_gw_channels = dfacts.DRAGON_OVERLAY_DEFAULT_NUM_GW_CHANNELS_PER_NODE @@ -469,7 +482,21 @@ def connect_to_be(self): self.log.debug("connected to be") def send_FENodeIdxBE(self): - fe_node_idx = dmsg.FENodeIdxBE(tag=next_tag(), node_index=self.node_index) + # Pack up all of our node descriptors for the backend: + forwarding = {} + for be_up in [self.be_is_up]: + assert isinstance(be_up, dmsg.BEIsUp), 'la_fe received invalid be up' + for key, node_desc in self.network_config.items(): + if str(be_up.host_id) == str(node_desc.host_id): + forwarding[key] = NodeDescriptor(host_id=int(node_desc.host_id), + ip_addrs=node_desc.ip_addrs, + overlay_cd=be_up.be_ch_desc) + break + fe_node_idx = dmsg.FENodeIdxBE( + tag=next_tag(), + node_index=self.node_index, + forward=forwarding, + send_desc=self.encoded_inbound) self.conn_out.send(fe_node_idx.serialize()) self.log.debug(f"send_FENodeIdxBE sent {fe_node_idx=}") @@ -497,8 +524,9 @@ def recv_SHChannelsUp(self): def send_LAChannelsInfo(self, custom_msg: str = None): if custom_msg is None: + nodes_desc = {ch_up.idx: ch_up.node_desc for ch_up in self.chs_up} la_ch_info = dmsg.LAChannelsInfo(tag=next_tag(), - nodes_desc=self.chs_up, + nodes_desc=nodes_desc, gs_cd=self.gs_cd, num_gw_channels=self.num_gw_channels, port=self.port, @@ -644,6 +672,11 @@ def send_OverlayPingBE(self): self.be_ta_conn.send(overlay_ping_be.serialize()) 
self.log.debug(f"send_OverlayPingBE sent {overlay_ping_be=}") + def recv_TAUpdateNodes(self): + be_hsta_update_nodes = get_with_blocking(self.be_ta_conn) + assert isinstance(be_hsta_update_nodes, dmsg.TAUpdateNodes), "expected TAUpdateNodes" + self.log.debug(f"recv_TAUpdateNodes got {be_hsta_update_nodes=}") + def recv_BEHaltOverlay(self): # M22 be_halt_overlay = get_with_blocking(self.be_ta_conn) assert isinstance(be_halt_overlay, dmsg.BEHaltOverlay), "expected BEHaltOverlay" @@ -748,16 +781,26 @@ def recv_BENodeIdxSH(self): assert isinstance(self.be_node_idx_sh, dmsg.BENodeIdxSH), "expected BENodeIdxSH" self.log.debug(f"recv_BENodeIdxSH got {self.be_node_idx_sh=}") - def send_SHChannelsUP(self, custom_msg=None): + @patch("dragon.infrastructure.gpu_desc.find_nvidia") + def send_SHChannelsUP(self, + mock_find_nvidia, + custom_msg=None, + accelerator_present=False): if custom_msg: self.la_input.send(custom_msg) else: + mock_find_nvidia.return_value = None + if accelerator_present: + mock_find_nvidia.return_value = (0, 1, 2, 3) + + node_desc = NodeDescriptor.get_localservices_node_conf(host_name=self.be_node_idx_sh.host_name, + name=self.be_node_idx_sh.host_name, + host_id=self.host_id, + ip_addrs=self.be_node_idx_sh.ip_addrs, + shep_cd=this_process.local_shep_cd) ch_up_msg = dmsg.SHChannelsUp(tag=next_tag(), - host_name=self.be_node_idx_sh.host_name, - host_id=self.host_id, - ip_addrs=self.be_node_idx_sh.ip_addrs, - shep_cd=this_process.local_shep_cd, + node_desc=node_desc, gs_cd=self.gs_cd, idx=self.be_node_idx_sh.node_idx) diff --git a/test/launcher/frontend_testing_mocks.py b/test/launcher/frontend_testing_mocks.py index eb40a3e..92c2e8d 100644 --- a/test/launcher/frontend_testing_mocks.py +++ b/test/launcher/frontend_testing_mocks.py @@ -7,6 +7,7 @@ from dragon.infrastructure.process_desc import ProcessDescriptor from dragon.infrastructure.connection import Connection, ConnectionOptions +from dragon.infrastructure.node_desc import NodeDescriptor from dragon.infrastructure.parameters import POLICY_INFRASTRUCTURE from dragon.infrastructure import facts as dfacts from dragon.infrastructure import messages as dmsg @@ -139,12 +140,12 @@ def send_shchannelsup(nodes, mpool): else: node['gs_ch'] = None gs_cd = None - + node_desc = NodeDescriptor(host_name=node['hostname'], + host_id=host_id, + ip_addrs=node['ip_addrs'], + shep_cd=B64.bytes_to_str(node['ls_ch'].serialize())) ch_up_msg = dmsg.SHChannelsUp(tag=next_tag(), - host_name=node['hostname'], - host_id=host_id, - ip_addrs=node['ip_addrs'], - shep_cd=B64.bytes_to_str(node['ls_ch'].serialize()), + node_desc=node_desc, gs_cd=gs_cd, idx=node['node_index']) log.info(f'construct SHChannelsUp: {ch_up_msg}') @@ -194,6 +195,23 @@ def handle_gsprocesscreate(primary_conn): primary_conn.send(response.serialize()) +def handle_gsprocesscreate_error(primary_conn): + '''Indicate an error with GSProcessCreate''' + + log = logging.getLogger('handle_gsprocesscreate_error') + proc_create = dmsg.parse(primary_conn.recv()) + log.info('presumably got GSProcessCreate') + assert isinstance(proc_create, dmsg.GSProcessCreate) + log.info('recvd GSProccessCreate') + + # Send error response + response = dmsg.GSProcessCreateResponse(tag=next_tag(), + ref=proc_create.tag, + err=dmsg.GSProcessCreateResponse.Errors.FAIL, + err_info="Error starting Head Process") + primary_conn.send(response.serialize()) + + def stand_up_backend(mock_overlay, mock_launch, network_config): log = logging.getLogger('mock_backend_standup') diff --git a/test/launcher/test_backend_bringup.py 
diff --git a/test/launcher/test_backend_bringup.py b/test/launcher/test_backend_bringup.py
index eb1f8f4..c34011e 100644
--- a/test/launcher/test_backend_bringup.py
+++ b/test/launcher/test_backend_bringup.py
@@ -9,8 +9,6 @@
 from unittest.mock import patch

 from dragon.infrastructure.facts import PROCNAME_LA_BE
-from dragon.infrastructure.messages import AbnormalTerminationError
-
 from dragon.launcher.backend import LauncherBackEnd

 from .launcher_testing_utils import catch_thread_exceptions
 from .backend_testing_mocks import LauncherBackendHelper, mock_start_localservices_wrapper
@@ -71,6 +69,8 @@ def tearDown(self):
            log.removeHandler(handler)
            handler.close()

+        del self.be_helper
+
    def start_backend_thread(self, args_map):

        # get startup going in another thread. Note: need to do threads
@@ -99,6 +99,29 @@ def test_clean_exit(self, mock_localservices_tuple, mock_network_config, mock_ov
        # join on the be thread
        self.be_thread.join()

+    @mock_start_localservices_wrapper
+    @patch("dragon.launcher.backend.start_overlay_network")
+    @patch("dragon.launcher.backend.NodeDescriptor.get_local_node_network_conf")
+    def test_accelerators_present(self, mock_localservices_tuple, mock_network_config, mock_overlay):
+        """Test that we can come up with accelerators present and exit cleanly"""
+
+        log = logging.getLogger("clean_be_startup_teardown")
+
+        args_map = self.be_helper.handle_network_and_frontend_start(mock_network_config)
+        self.start_backend_thread(args_map)
+
+        self.be_helper.clean_startup(log,
+                                     mock_overlay,
+                                     mock_network_config,
+                                     mock_localservices_tuple,
+                                     accelerator_present=True)
+        self.be_helper.launch_user_process(log)
+        self.be_helper.clean_shutdown(log)
+        self.be_helper.cleanup()
+
+        # join on the be thread
+        self.be_thread.join()
+
    @catch_thread_exceptions
    @mock_start_localservices_wrapper
    @patch("dragon.launcher.backend.start_overlay_network")
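test_accelerators_present drives the same bringup path while the backend-side mock forces GPU discovery to report devices (send_SHChannelsUP above patches dragon.infrastructure.gpu_desc.find_nvidia to return the tuple (0, 1, 2, 3)). The general patching pattern, reduced to a runnable stdlib sketch in which find_gpus and describe_node are hypothetical stand-ins for the Dragon internals:

    # accel_probe_sketch.py -- patching a discovery helper in a unit test
    import unittest
    from unittest.mock import patch

    def find_gpus():
        # Stand-in for real device discovery; the tests never let this run.
        raise RuntimeError('would touch real hardware')

    def describe_node():
        devices = find_gpus()
        return {'accelerators': devices if devices else None}

    class AcceleratorPresentTest(unittest.TestCase):
        @patch(f'{__name__}.find_gpus')
        def test_accelerators_present(self, mock_find_gpus):
            # Pretend discovery found four device ordinals, as the Dragon
            # mock does with find_nvidia.
            mock_find_gpus.return_value = (0, 1, 2, 3)
            self.assertEqual(describe_node()['accelerators'], (0, 1, 2, 3))

        @patch(f'{__name__}.find_gpus')
        def test_no_accelerators(self, mock_find_gpus):
            mock_find_gpus.return_value = None
            self.assertIsNone(describe_node()['accelerators'])

    if __name__ == '__main__':
        unittest.main()

Patching at the point of use rather than at the definition site is what lets the rest of the bringup path run unmodified; only the probe's answer changes.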
diff --git a/test/launcher/test_frontend_bringup.py b/test/launcher/test_frontend_bringup.py
index 8c72edd..caa687c 100644
--- a/test/launcher/test_frontend_bringup.py
+++ b/test/launcher/test_frontend_bringup.py
@@ -24,7 +24,13 @@
 from .frontend_testing_mocks import run_frontend, open_backend_comms, open_overlay_comms
 from .frontend_testing_mocks import send_beisup, handle_teardown, recv_fenodeidx, send_shchannelsup, recv_lachannelsinfo
-from .frontend_testing_mocks import handle_gsprocesscreate, handle_bringup, stand_up_backend, handle_overlay_teardown
+from .frontend_testing_mocks import (
+    handle_gsprocesscreate,
+    handle_gsprocesscreate_error,
+    handle_bringup,
+    stand_up_backend,
+    handle_overlay_teardown
+)
 from .frontend_testing_mocks import send_abnormal_term
@@ -165,6 +171,72 @@ def test_clean_exit(self, mock_overlay, mock_launch):
        # Join on the frontend thread
        fe_proc.join()

+    @unittest.skip('HSTA not currently supported in open source.')
+    @catch_thread_exceptions
+    @patch('dragon.launcher.frontend.LauncherFrontEnd._launch_backend')
+    @patch('dragon.launcher.frontend.start_overlay_network')
+    def test_error_launching_head_process(self, exceptions_caught_in_threads, mock_overlay, mock_launch):
+        '''Test error launching head process'''
+
+        args_map = get_args_map(self.network_config)
+
+        # get startup going in another thread. Note: need to do threads in order to use
+        # all our mocks
+        fe_proc = threading.Thread(name='Frontend Server',
+                                   target=run_frontend,
+                                   args=(args_map,),
+                                   daemon=False)
+        fe_proc.start()
+
+        # Get backend up
+        la_info = self.do_bringup(mock_overlay, mock_launch)
+
+        # Check we launched the backend with default transport
+        self.assertEqual(la_info.transport, TransportAgentOptions.HSTA)
+
+        # Receive GSProcessCreate
+        handle_gsprocesscreate_error(self.primary_conn)
+
+        # Join on the frontend thread
+        fe_proc.join()
+
+        logging.info(f"exception: {exceptions_caught_in_threads}")
+        assert 'Frontend Server' in exceptions_caught_in_threads  # there was an exception in thread 1
+        assert exceptions_caught_in_threads['Frontend Server']['exception']['type'] == RuntimeError
+        assert str(exceptions_caught_in_threads['Frontend Server']['exception']['value']) == 'Abnormal exit detected'
+
+    @unittest.skip('HSTA not currently supported in open source.')
+    @patch('dragon.launcher.frontend.LauncherFrontEnd._launch_backend')
+    @patch('dragon.launcher.frontend.start_overlay_network')
+    def test_clean_exit_with_hsta_launch(self, mock_overlay, mock_launch):
+        '''Test a clean bring-up and teardown with HSTA'''
+
+        args_map = get_args_map(self.network_config,
+                                arg1=['-t', 'hsta'])
+
+        # get startup going in another thread. Note: need to do threads in order to use
+        # all our mocks
+        fe_proc = threading.Thread(name='Frontend Server',
+                                   target=run_frontend,
+                                   args=(args_map,),
+                                   daemon=False)
+        fe_proc.start()
+
+        # Get backend up
+        la_info = self.do_bringup(mock_overlay, mock_launch)
+
+        # Check we launched the backend with the explicitly requested HSTA transport
+        self.assertEqual(la_info.transport, TransportAgentOptions.HSTA)
+
+        # Receive GSProcessCreate
+        handle_gsprocesscreate(self.primary_conn)
+
+        # Send GSHalted
+        handle_teardown(self.be_nodes, self.primary_conn, self.fe_ta_conn)
+
+        # Join on the frontend thread
+        fe_proc.join()
+
    @patch('dragon.launcher.frontend.LauncherFrontEnd._launch_backend')
    @patch('dragon.launcher.frontend.start_overlay_network')
    def test_clean_exit_with_tcp_launch(self, mock_overlay, mock_launch):
@@ -315,6 +387,7 @@ def test_launch_overlay_exception(self, exceptions_caught_in_threads, mock_overl
        assert str(exceptions_caught_in_threads['Frontend Server']['exception']['value']) \
            == 'Overlay transport agent launch failed on launcher frontend'

+    @unittest.skip("Skipped pending fix of problem outlined in CIRRUS-1922. This test sporadically fails.")
    @catch_thread_exceptions
    @patch('dragon.launcher.frontend.LauncherFrontEnd._launch_backend')
    @patch('dragon.launcher.frontend.start_overlay_network')
diff --git a/test/launcher/test_signal_handling.py b/test/launcher/test_signal_handling.py
index aeb0e93..b64945b 100644
--- a/test/launcher/test_signal_handling.py
+++ b/test/launcher/test_signal_handling.py
@@ -12,6 +12,7 @@
 from dragon.launcher.launchargs import get_args

 from dragon.infrastructure.process_desc import ProcessDescriptor
+from dragon.infrastructure.node_desc import NodeDescriptor
 from dragon.infrastructure import facts as dfacts
 from dragon.infrastructure import messages as dmsg
 import dragon.utils as du
@@ -68,11 +69,12 @@ def send_shchannelsup(nodes, mpool):
            node['gs_ch'] = None
            gs_cd = None

+        node_desc = NodeDescriptor(host_name=node['hostname'],
+                                   host_id=host_id,
+                                   ip_addrs=node['ip_addrs'],
+                                   shep_cd=B64.bytes_to_str(node['ls_ch'].serialize()))
        ch_up_msg = dmsg.SHChannelsUp(tag=next_tag(),
-                                      host_name=node['hostname'],
-                                      host_id=host_id,
-                                      ip_addrs=node['ip_addrs'],
-                                      shep_cd=B64.bytes_to_str(node['ls_ch'].serialize()),
+                                      node_desc=node_desc,
                                      gs_cd=gs_cd,
                                      idx=node['node_index'])
        log.info(f'construct SHChannelsUp: {ch_up_msg}')
@@ -895,6 +897,7 @@ def test_sigint_hung_overlay(self, mock_overlay, mock_be_launch):

        mock_procs.join()

+    @unittest.skip("Skipped pending fix of problem outlined in CIRRUS-1922. This test terminates too much and sometimes kills Docker container.")
    @patch('dragon.launcher.frontend.LauncherFrontEnd._launch_backend')
    @patch('dragon.launcher.frontend.start_overlay_network')
    def test_rapid_sigint(self, mock_overlay, mock_be_launch):
@@ -949,6 +952,7 @@ def test_rapid_sigint_no_mock(self):
        else:
            print("Unable to run. Requires WLM job allocation")

+    @unittest.skip("Skipped pending fix of problem outlined in CIRRUS-1922. This test terminates too much and sometimes kills Docker container.")
    @patch('dragon.launcher.frontend.LauncherFrontEnd._launch_backend')
    @patch('dragon.launcher.frontend.start_overlay_network')
    def test_teardown_with_hung_backend_sigint(self, mock_overlay, mock_be_launch):
diff --git a/test/multi-node/test_array.py b/test/multi-node/test_array.py
index e61d1a5..22ae098 100644
--- a/test/multi-node/test_array.py
+++ b/test/multi-node/test_array.py
@@ -14,7 +14,7 @@
 import random


-def process_function(event, array, queue, max_val, idx,):
+def process_function(event, array, queue, max_val, idx, barrier,):

    p = mp.current_process()
    mypuid = p.pid
@@ -23,10 +23,11 @@ def process_function(event, array, queue, max_val, idx,):
    event.wait()

    array[idx] = 1
+    barrier.wait()

    # while the value is less than max_val
    while array[idx] < max_val:
-        with array.get_lock(): 
+        with array.get_lock():
            array[idx] += 1
            array_list = array[:]
            array_list.append(random.randint(-sys.maxsize - 1, sys.maxsize))
@@ -71,9 +72,10 @@ def test_lock_fairness(self,):
        array = mp.Array("i", range(128+nproc), lock=True,)
        queue = mp.Queue()
        event = mp.Event()
+        barrier = mp.Barrier(nproc)

        for idx in range(nproc):
-            p = mp.Process(target=process_function,args=(event, array, queue, max_val, idx,),)
+            p = mp.Process(target=process_function,args=(event, array, queue, max_val, idx, barrier,),)
            p.start()
            procs.append(p)
            puid_dict[p.pid] = 0
@@ -81,22 +83,19 @@ def test_lock_fairness(self,):
        # parent sets event for child processes to work at the same time
        event.set()

-        cnt = 0
-
-        for idx in range(0,max_val,):
-            puid, val, = (queue.get())
-            # check that the array from child process is valid
-            cnt += 1
-            # increment each time child puid returns array
-            puid_dict[puid] += 1
+        for _ in range(1, max_val):
+            for pidx in range(nproc):
+                puid, val, = (queue.get(timeout=10))
+                # increment each time child puid returns array
+                puid_dict[puid] += 1

        numpy_puids = np.array(list(puid_dict.values()))
        # print minimum, maximum, mean, and median from puids_dict
        (puids_max,puids_min,puids_mean,puids_median,) = (np.max(numpy_puids),np.min(numpy_puids),np.mean(numpy_puids),np.median(numpy_puids),)

-        print(f"Lock acquisition, {max_val/nproc} tries / proc, {nproc} processes:")
-        print(f"    Maximum: {puids_max}, Minimum: {puids_min}, Mean: {puids_mean}, Median: {puids_median}")
+        print(f"Lock acquisition, {max_val/nproc} tries / proc, {nproc} processes:", flush=True)
+        print(f"    Maximum: {puids_max}, Minimum: {puids_min}, Mean: {puids_mean}, Median: {puids_median}", flush=True)

        # check that the child process exited cleanly in parent process
        for p in procs:
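The barrier added above closes a startup race: without it, early workers could start hammering the lock before the last worker had even initialized its slot, skewing the fairness counts. The shape of the pattern in a self-contained stdlib example (worker and iteration counts are arbitrary):

    # barrier_fairness_sketch.py -- line workers up before they contend
    import multiprocessing as mp

    def worker(barrier, total, counts, idx, max_val):
        barrier.wait()                    # start the contention together
        while True:
            with total.get_lock():
                if total.value >= max_val:
                    return
                total.value += 1
                counts[idx] += 1          # this worker won the lock once

    if __name__ == '__main__':
        nproc, max_val = 4, 20000
        total = mp.Value('i', 0)
        counts = mp.Array('i', nproc)     # shared per-worker acquisition counters
        barrier = mp.Barrier(nproc)
        procs = [mp.Process(target=worker, args=(barrier, total, counts, i, max_val))
                 for i in range(nproc)]
        for p in procs:
            p.start()
        for p in procs:
            p.join()
        # A roughly even split across workers suggests fair lock handoff.
        print(list(counts), 'sum =', sum(counts))

The queue.get(timeout=10) change in the same hunk serves the companion purpose: if a child wedges, the test fails with a timeout instead of hanging the whole multi-node run.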
diff --git a/test/multi-node/test_dict.py b/test/multi-node/test_dict.py
index 3bf9d1b..8c0099f 100644
--- a/test/multi-node/test_dict.py
+++ b/test/multi-node/test_dict.py
@@ -63,7 +63,7 @@ def do_set_ops(_keys, ddict, value_size):

 def do_del_ops(_keys, ddict):
     """Each client will delete the given list of keys in the dictionary
-    
+
     :param _keys: list of all keys to be deleted from the dictionary
     :type _keys: list
     :param ddict: dragon distributed dictionary
@@ -76,7 +76,7 @@ def do_del_ops(_keys, ddict):

 def do_get_ops(_keys, ddict):
     """Each client will retrieve the values for the given list of keys in the dictionary
-    
+
     :param _keys: list of all keys to be fetched from the dictionary
     :type _keys: list
     :param ddict: dragon distributed dictionary
@@ -343,6 +343,7 @@ def do_dict_workload(self, num_clients, ddict):
        for i in range(len(procs)):
            procs[i].kill()

+    @unittest.skip('CIRRUS-1820: Hanging')
    def test_dict_with_stress_load(self):
        managers_per_node = 1  # number of managers per node
        num_nodes = len(get_list())  # Collect the total number of nodes
diff --git a/test/multi-node/test_process_group.py b/test/multi-node/test_process_group.py
index 7b16753..228d93b 100644
--- a/test/multi-node/test_process_group.py
+++ b/test/multi-node/test_process_group.py
@@ -151,6 +151,21 @@ def test_walltime(self):

        pg.stop()

+    @unittest.skip("CIRRUS-1831: Will fail until PG api is fixed. The TODO comment should also be addressed.")
+    def test_node_id(self):
+        pg = ProcessGroup(self.template, 4)
+        pg.start()
+
+        nodes = node_get_list()
+        puids = pg.puids
+        for puid in puids:
+            gs_info = process_query(puid)
+            print(gs_info)
+            # TODO: Increment counter for each node, assert we've got equal distribution of procs
+
+        pg.stop()
+
+
if __name__ == "__main__":
    unittest.main()
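The TODO in test_node_id spells out the missing assertion: tally which node each puid landed on and check the group was spread evenly. Once the Dragon query (process_query above) yields a node identity per puid, the bookkeeping is plain Python; a runnable sketch with a made-up puid-to-node mapping standing in for the real query results:

    # distribution_check_sketch.py -- the assertion the TODO asks for
    from collections import Counter

    # Stand-in for "process_query(puid) -> node" results; the real mapping
    # would come from Global Services.
    puid_to_node = {4001: 'nid0001', 4002: 'nid0002',
                    4003: 'nid0001', 4004: 'nid0002'}
    nodes = ['nid0001', 'nid0002']

    per_node = Counter(puid_to_node.values())
    expected = len(puid_to_node) // len(nodes)

    # Equal distribution: every node hosts the same number of group members.
    assert all(per_node[node] == expected for node in nodes), per_node
    print(dict(per_node))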
diff --git a/test/native/test_process_group.py b/test/native/test_process_group.py
index f12e458..c0c4340 100644
--- a/test/native/test_process_group.py
+++ b/test/native/test_process_group.py
@@ -515,7 +515,6 @@ def test_bad_transitions(self):
        pg.init()
        self.assertTrue(pg.status == "Idle")

-        self.assertRaises(DragonProcessGroupError, pg.kill, signal.SIGTERM)
        self.assertRaises(DragonProcessGroupError, pg.kill)

        pg.start()
diff --git a/test/release/test_scipy_img_scale.sh b/test/release/test_scipy_img_scale.sh
new file mode 100755
index 0000000..0223faa
--- /dev/null
+++ b/test/release/test_scipy_img_scale.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+# To change the version of Python, please use the following commands:
+#   To find the versions of Python on your system: ls -ls /usr/bin/python*
+#   To load the version of Python you want: `alias python=python` or `alias python='/usr/bin/python'`
+# The work time or compute time is set to 1 second. This determines the execution time of the program.
+
+# The following timings for the scipy_scale_work.py file were collected using Python 3.11.5 with 8 nodes. The program inputs, the timings for the iterations, the average time, and the standard deviations for the iterations are recorded in the tables below. The tables are grouped by number of iterations.
+
+# 4 iterations
+
+# NUM_WORKERS | NUM_IMAGES | NUM_BURNS | NUM_ITER | SIZE_IMG | MEM_SIZE | WORK TIME | 0TH_ITER(s) | 1ST_ITER(s) | 2ND_ITER(s) | 3RD_ITER(s) | 4TH_ITER(s) | AVG_TIME (s) | STD_DEV (s)
+# 1024        | 32768      | 1         | 4        | 32       | 33554432 | 1.00      | 36.53       | 33.71       | 33.75       | 34.13       | 33.77       | 33.84        | 00.17
+
+# The following timings for the scipy_scale_work.py file were collected using Python 3.10.12 with 8 nodes. The program inputs, the timings for the iterations, the average time, and the standard deviations for the iterations are recorded in the tables below. The tables are grouped by number of iterations.
+
+# 4 iterations
+
+# NUM_WORKERS | NUM_IMAGES | NUM_BURNS | NUM_ITER | SIZE_IMG | MEM_SIZE | WORK TIME | 0TH_ITER(s) | 1ST_ITER(s) | 2ND_ITER(s) | 3RD_ITER(s) | 4TH_ITER(s) | AVG_TIME (s) | STD_DEV (s)
+# 1024        | 32768      | 1         | 4        | 32       | 33554432 | 1.00      | 36.58       | 33.65       | 33.92       | 33.99       | 33.80       | 33.84        | 00.13
+
+# To set up a multi-node environment for deployment of the script, it is recommended to run `salloc --nodes=8 <--exclusive if needed> <-t hh:mm:ss>`.
+# The arguments encapsulated by <> are recommended but not needed.
+
+# The following line of code can be run to allocate the number of nodes needed:
+# salloc --nodes=8 --exclusive -t 04:00:00
+
+# This runs the script with the parameters of 1024 workers, 32768 images, 1 burn cycle, 4 iterations, images of size 32, memory size of 33554432, and a work time for each image of 1 second.
+dragon ../../examples/multiprocessing/numpy-mpi4py-examples/scipy_scale_work.py --dragon --num_workers 1024 --iterations 4 --burns 1 --size 32 --mem 33554432 --work_time 1
+
+# # The following lines of code ensure swift cleanup.
+# #dragon-cleanup
+# #scancel -u $USER
\ No newline at end of file
diff --git a/test/test_integration_shep_gs.py b/test/test_integration_shep_gs.py
index 3804150..30cc2af 100755
--- a/test/test_integration_shep_gs.py
+++ b/test/test_integration_shep_gs.py
@@ -170,9 +170,9 @@ def start_duts(self):

        sh_msg = tsu.get_and_check_type(self.bela_input_rh, dmsg.SHChannelsUp)
        self.assertIsInstance(sh_msg, dmsg.SHChannelsUp)
-        self.assertEqual(dutils.host_id(), sh_msg.host_id)
+        self.assertEqual(dutils.host_id(), sh_msg.node_desc.host_id)
        if self.chatty_teardown:
-            print(f"info from SHChannelsUp: host_id={sh_msg.host_id}, hostname={sh_msg.host_name}")
+            print(f"info from SHChannelsUp: host_id={sh_msg.node_desc.host_id}, hostname={sh_msg.node_desc.host_name}")
            sys.stdout.flush()

        tsu.get_and_check_type(self.bela_input_rh, dmsg.GSIsUp)
@@ -206,11 +206,20 @@ def tearDown(self) -> None:

        self.proc_gs_return_chan.destroy()

+        # When ref counting works this should detach.
        self.inf_pool.detach()

        self.shep_stdin_wh.send(dmsg.BEHalted(tag=self.next_tag()).serialize())

        tsu.get_and_check_several_ignore_SHFwdOutput(self, self.shep_stdout_rh, {dmsg.SHHalted: 1})

+        # We can safely destroy and detach for sure once we have received SHHalted. If it fails,
+        # the detach above worked.
+        try:
+            self.inf_pool.destroy()
+        except:
+            pass
+
        self.shep_stdin_rh.close()
        self.shep_stdin_wh.close()
        self.shep_stdout_rh.close()
@@ -706,14 +715,15 @@ def start_multi_duts(self):

            msg = tsu.get_and_check_type(self.names[f'bela_input_rh_{i}'], dmsg.SHChannelsUp)
            assert isinstance(msg, dmsg.SHChannelsUp)
-            self.assertEqual(dutils.host_id(), msg.host_id)
+            self.assertEqual(dutils.host_id(), msg.node_desc.host_id)
            if self.chatty_teardown:
                print(f'got SHChannelsUp from LS {i}', flush=True)
-                print(f'Info from SHChannelsUp: host_id={msg.host_id}, hostname={msg.host_name}', flush=True)
+                print(f'Info from SHChannelsUp: host_id={msg.node_desc.host_id}, hostname={msg.node_desc.host_name}', flush=True)
            chs_up.append(msg)

-        la_ch_info = dmsg.LAChannelsInfo(tag=self.next_tag(), nodes_desc=chs_up,
+        nodes_desc = {ch_up.idx: ch_up.node_desc for ch_up in chs_up}
+        la_ch_info = dmsg.LAChannelsInfo(tag=self.next_tag(), nodes_desc=nodes_desc,
                                         gs_cd=self.gs_cd, num_gw_channels=0)
        for i in range(self.nls):
            # communicate gs_cd to all local services processes so that they
diff --git a/test/test_shepherd.py b/test/test_shepherd.py
index e6e1a5d..88998e0 100755
--- a/test/test_shepherd.py
+++ b/test/test_shepherd.py
@@ -60,14 +60,6 @@ def tearDown(self) -> None:
        self.shep_stdout_rh.close()
        self.shep_stdin_wh.close()

-        self.shep_ch.detach()
-        self.be_ch.detach()
-        self.gs_ch.detach()
-
-        if ATTACH_POOLS:
-            self.def_pool.detach()
-            self.inf_pool.detach()
-
    def next_tag(self):
        tmp = self.tag
        self.tag += 1
@@ -91,8 +83,6 @@ def do_bringup(self):

            inf_pd = du.B64.str_to_bytes(msg.inf_pd)
            self.inf_pool = dmm.MemoryPool.attach(inf_pd)
-        else:
-            print(f'shep_cd: {msg.shep_cd}')

        shep_cd = du.B64.str_to_bytes(msg.shep_cd)
        self.shep_ch = dch.Channel.attach(shep_cd)
@@ -138,8 +128,35 @@ def do_teardown(self):

        tsu.get_and_check_type(self.shep_stdout_rh, dmsg.SHHalted)
        self.proc.join()
-        if not ATTACH_POOLS:
-            time.sleep(10)
+
+        # In place of the next three destroy calls we could have called
+        # detach, but this demonstrates that we can call destroy twice
+        # on a channel and it will work. Local services destroyed first,
+        # and then after the join above we know we can destroy here as well.
+        # The advantage of calling destroy here is that since we know we are
+        # done with the channel, destroy will ignore any ref counting and clean
+        # it up.
+        try:
+            self.gs_ch.destroy()
+        except:
+            pass
+
+        try:
+            self.be_ch.destroy()
+        except:
+            pass
+
+        try:
+            self.shep_ch.destroy()
+        except:
+            pass
+
+        if ATTACH_POOLS:
+            # Calling destroy here is OK because we have joined on exit of local services
+            # which already destroyed the pools. Calling destroy here forcefully detaches
+            # from the pool (regardless of ref counting).
+            self.def_pool.destroy()
+            self.inf_pool.destroy()

    def do_abnormal_teardown(self):
        # when the BE receives AbnormalTermination, then the SHTeardown proceeds as usual
@@ -152,6 +169,29 @@ def do_abnormal_teardown(self):

        self.proc.join()

+        try:
+            self.gs_ch.detach()
+        except:
+            pass
+
+        try:
+            self.be_ch.detach()
+        except:
+            pass
+
+        try:
+            self.shep_ch.detach()
+        except:
+            pass
+
+        if ATTACH_POOLS:
+            # Calling destroy here is OK because we have joined on exit of local services
+            # which already destroyed the pools. Calling destroy here forcefully detaches
+            # from the pool (regardless of ref counting).
+            self.def_pool.destroy()
+            self.inf_pool.destroy()
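The teardown comments above capture the idiom: once the owner is known to have exited, call destroy unconditionally and swallow the error if the resource is already gone, so cleanup never depends on who won the race. A compact stdlib rendering of that idiom (the Channel class is a stand-in; catching a narrow exception type rather than a bare except: is the one refinement over the test code):

    # idempotent_teardown_sketch.py -- destroy is safe to call twice
    class AlreadyDestroyed(Exception):
        pass

    class Channel:
        """Stand-in resource: destroy() fails if called twice."""
        def __init__(self):
            self.alive = True

        def destroy(self):
            if not self.alive:
                raise AlreadyDestroyed
            self.alive = False

    def destroy_quietly(resource):
        # Second (or concurrent) destroy attempts are fine; the resource
        # being gone is exactly the state we want.
        try:
            resource.destroy()
        except AlreadyDestroyed:
            pass

    ch = Channel()
    destroy_quietly(ch)   # real teardown
    destroy_quietly(ch)   # no-op: already destroyed
    assert not ch.alive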
+
+
    def _make_pool(self, tag, p_uid, r_c_uid, size, m_uid, name):
        self.shep_main_wh.send(
            dmsg.SHPoolCreate(tag=tag, p_uid=p_uid, r_c_uid=r_c_uid, size=size,
@@ -171,11 +211,6 @@ def test_bringup_teardown(self):
        self.do_bringup()
        self.do_teardown()

-    # def test_bringup_teardown2(self):
-    #     # for diagnosing pool lifecycle issues
-    #     self.do_bringup()
-    #     self.do_teardown()
-
    def test_process_fwdoutput(self):
        self.do_bringup()
        target_puid = 17777
diff --git a/test/transport/tcp/test_address.py b/test/transport/tcp/test_address.py
index 57fe9a5..b088a8e 100644
--- a/test/transport/tcp/test_address.py
+++ b/test/transport/tcp/test_address.py
@@ -10,7 +10,7 @@
 class AddressIPv4TestCase(unittest.IsolatedAsyncioTestCase):

     IP = ip_address('127.0.0.1')
-    NETLOC = ['127.0.0.1', '[127.0.0.1]']
+    NETLOC = ['127.0.0.1', '127.0.0.1']

     LOOPBACK = transport.LOOPBACK_ADDRESS_IPv4
diff --git a/test/transport/test_lsif.py b/test/transport/test_lsif.py
index e9b679d..40388fd 100644
--- a/test/transport/test_lsif.py
+++ b/test/transport/test_lsif.py
@@ -9,6 +9,7 @@
 from dragon.infrastructure import facts as dfacts
 from dragon.infrastructure import messages as dmsg
 from dragon.infrastructure.connection import Connection
+from dragon.infrastructure.node_desc import NodeDescriptor
 from dragon.launcher.util import next_tag
 from dragon.managed_memory import MemoryPool
 from dragon.transport import start_transport_agent
@@ -72,10 +73,19 @@ def setUp(self):

        LOGGER.info('Memory pools and channels created')

+        # Use the nodes dict to construct a dictionary of NodeDescriptor objects
+        nodes_desc = {
+            i: NodeDescriptor(ip_addrs=node['ip_addrs'],
+                              host_id=node['host_id'],
+                              shep_cd=node['shep_cd'],
+                              host_name=node['host_name'])
+            for i, node in self.nodes.items()
+        }
+
        # Create LAChannelsInfo
        la_ch_info = dmsg.LAChannelsInfo(
            tag=next_tag(),
-            nodes_desc=self.nodes,
+            nodes_desc=nodes_desc,
            gs_cd='',
            num_gw_channels=NUM_GW_CHANNELS_PER_NODE,
        )
diff --git a/test/utils/test_attach.c b/test/utils/test_attach.c
index 1731abf..4abb151 100644
--- a/test/utils/test_attach.c
+++ b/test/utils/test_attach.c
@@ -116,13 +116,6 @@ int main(int argc, char **argv)
        main_err_fail(derr, "Failed to detach from pool", jmp_destroy_pool);
    }

-    /* FIXME: This is temporary while PE-43816 is being worked on. */
-    printf("This is a temporary fix to remove pool until PE-43816 is completed.\n");
-    derr = dragon_memory_pool_destroy(&mpool);
-    if (derr != DRAGON_SUCCESS) {
-        err_fail(derr, "Failed to destroy the memory pool");
-    }
-
    return SUCCESS;

    /******** Check our channel attach/detach before our final pool detach ********/
diff --git a/test/utils/test_basic_mempool.py b/test/utils/test_basic_mempool.py
index 6505291..4d7a079 100644
--- a/test/utils/test_basic_mempool.py
+++ b/test/utils/test_basic_mempool.py
@@ -26,7 +26,7 @@ def mk_remote_mem_ser(local_mem_ser):
     mem_ser_array = bytearray(local_mem_ser)
     mem_ser_array[8] = 99  # change m_uid
-    mem_ser_array[16] = 99  # change hostid
+    mem_ser_array[16] = mem_ser_array[16] ^ 99  # change hostid
     mem_ser_remote = bytes(mem_ser_array)

     return mem_ser_remote
@@ -39,7 +39,7 @@ def mk_remote_pool_ser(local_pool_ser):
     pool_ser_array = bytearray(local_pool_ser)
     pool_ser_array[0] = 99  # change m_uid
-    pool_ser_array[8] = 99  # change hostid
+    pool_ser_array[8] = pool_ser_array[8] ^ 99  # change hostid
     pool_ser_remote = bytes(pool_ser_array)

     return pool_ser_remote
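The last hunk is worth a note: assigning a constant (= 99) fails to produce a "remote" serialized descriptor whenever the target byte already happens to be 99, while XOR with a nonzero constant is guaranteed to change the byte and is self-inverting. A standalone illustration (the byte values and offsets here are arbitrary, not the real descriptor layout):

    # xor_flip_sketch.py -- guarantee a byte differs from the original
    original = bytes([7, 99, 42])          # note: one byte is already 99

    assigned = bytearray(original)
    assigned[1] = 99                       # no-op: byte was already 99
    assert bytes(assigned) == original     # "remote" copy identical: the old bug

    flipped = bytearray(original)
    flipped[1] ^= 99                       # XOR with nonzero always changes it
    assert bytes(flipped) != original      # copy is guaranteed to differ

    flipped[1] ^= 99                       # XOR is self-inverting
    assert bytes(flipped) == original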