diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index bb86ae537f..03b65a3b86 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -3,11 +3,14 @@ name: CI
on:
push:
- branches: [ devel ]
+ branches:
+ - 'devel*'
pull_request:
- branches: [ devel ]
+ branches:
+ - 'devel*'
jobs:
+
linting:
runs-on: ubuntu-latest
steps:
@@ -22,10 +25,7 @@ jobs:
python -m venv testenv
. testenv/bin/activate
python -m pip install --upgrade pip setuptools wheel
- python -m pip install git+https://github.com/radical-cybertools/radical.utils.git@devel
- python -m pip install git+https://github.com/radical-cybertools/radical.saga.git@devel
- python -m pip install --upgrade .
- python -m pip install -r requirements-tests.txt
+ python -m pip install -r requirements-ci.txt
- name: Lint with flake8 and pylint
run: |
. testenv/bin/activate
@@ -41,22 +41,8 @@ jobs:
flake8 $FILTERED
pylint $FILTERED
- notebook_integration_test:
- strategy:
- matrix:
- python-version: [ '3.7', '3.8', '3.9', '3.10', '3.11' ]
- uses: ./.github/workflows/run-rp-notebook.yml
- with:
- python-version: ${{ matrix.python-version }}
- notebook-name: 'getting_started.ipynb'
-
tests:
runs-on: ${{ matrix.os }}
- services:
- mongodb:
- image: mongo
- ports:
- - 27017/tcp
strategy:
matrix:
os: [ ubuntu-latest ]
@@ -65,53 +51,40 @@ jobs:
- os: ubuntu-20.04
python-version: '3.6'
steps:
- - uses: actions/checkout@v3
- with:
- fetch-depth: 2
- - uses: actions/setup-python@v4
- with:
- python-version: ${{ matrix.python-version }}
- - name: Install dependencies
- run: |
- python -m venv testenv
- . testenv/bin/activate
- python -m pip install --upgrade pip setuptools wheel
- python -m pip install git+https://github.com/radical-cybertools/radical.utils.git@devel
- python -m pip install git+https://github.com/radical-cybertools/radical.saga.git@devel
- python -m pip install --upgrade .
- python -m pip install -r requirements-tests.txt
- - name: Unit tests
- env:
- RADICAL_PILOT_DBURL: mongodb://localhost:${{ job.services.mongodb.ports[27017] }}/test
- run: |
- . testenv/bin/activate
- coverage run --source=radical.pilot -m pytest -ra -vvv --showlocals tests/unit_tests/ tests/component_tests/
- coverage xml
- - uses: codecov/codecov-action@v3
- if: ${{ matrix.python-version == '3.7' }}
- with:
- files: ./coverage.xml
- # - name: Integration test
- # env:
- # MONGODB_HOST: localhost
- # MONGODB_PORT: ${{ job.services.mongodb.ports[27017] }}
- # RADICAL_PILOT_DBURL: mongodb://localhost:${{ job.services.mongodb.ports[27017] }}/test
- # TARGET_PATH: 'docs/source/getting_started.ipynb'
- # run: |
- # . testenv/bin/activate
- # python -m pip install -r requirements-docs.txt
- # python -m pip install jupyter
- # jupyter nbconvert --clear-output --inplace $TARGET_PATH
- # jupyter nbconvert --to notebook --execute --inplace $TARGET_PATH
+ - uses: actions/checkout@v3
+ with:
+ fetch-depth: 2
+ - uses: actions/setup-python@v4
+ with:
+ python-version: ${{ matrix.python-version }}
+ - name: Install dependencies
+ run: |
+ python -m venv testenv
+ . testenv/bin/activate
+ python -m pip install --upgrade pip setuptools wheel
+ python -m pip install -r requirements-ci.txt
+ - name: Unit tests
+ run: |
+ . testenv/bin/activate
+ coverage run --source=radical.pilot -m pytest -ra -vvv --showlocals tests/unit_tests/ tests/component_tests/
+ coverage xml
+ - uses: codecov/codecov-action@v3
+ if: ${{ matrix.python-version == '3.7' }}
+ with:
+ files: ./coverage.xml
+ notebook_integration_test:
+ strategy:
+ matrix:
+ python-version: [ '3.7', '3.8', '3.9', '3.10', '3.11' ]
+ uses: ./.github/workflows/run-rp-notebook.yml
+ with:
+ python-version: ${{ matrix.python-version }}
+ requirements-file: 'requirements-docs-ci.txt'
+ notebook-name: 'getting_started.ipynb'
analytics:
runs-on: ${{ matrix.os }}
- services:
- mongodb:
- image: mongo
- ports:
- - 27017/tcp
strategy:
matrix:
os: [ ubuntu-latest ]
@@ -120,41 +93,36 @@ jobs:
- os: ubuntu-20.04
python-version: '3.6'
steps:
- - uses: actions/checkout@v3
- with:
- fetch-depth: 2
- - uses: actions/setup-python@v4
- with:
- python-version: ${{ matrix.python-version }}
- - name: Install dependencies
- run: |
- sudo apt update
- sudo apt install -y texlive cm-super
- sudo apt install -y texlive-fonts-extra texlive-extra-utils dvipng
- sudo apt install -y texlive-fonts-recommended texlive-latex-extra
- python -m venv testenv
- . testenv/bin/activate
- python -m pip install --upgrade pip setuptools wheel
- python -m pip install git+https://github.com/radical-cybertools/radical.utils.git@devel
- python -m pip install git+https://github.com/radical-cybertools/radical.saga.git@devel
- python -m pip install --upgrade .
- python -m pip install git+https://github.com/radical-cybertools/radical.analytics.git@devel
- - name: analyze example session
- timeout-minutes: 5
- env:
- RADICAL_PILOT_DBURL: mongodb://localhost:${{ job.services.mongodb.ports[27017] }}/test
- run: |
- . testenv/bin/activate
- ./examples/00_getting_started.py local.localhost
- SID=$(ls -rt | grep rp.session)
- echo "$SID: $SID"
- radical-analytics-inspect "$SID"
- mkdir artifacts
- ls -la
- cp -R *.png *.stats artifacts
- - name: upload artifacts
- uses: actions/upload-artifact@v3
- with:
- name: artifacts
- path: artifacts
+ - uses: actions/checkout@v3
+ with:
+ fetch-depth: 2
+ - uses: actions/setup-python@v4
+ with:
+ python-version: ${{ matrix.python-version }}
+ - name: Install dependencies
+ run: |
+ sudo apt update
+ sudo apt install -y texlive cm-super
+ sudo apt install -y texlive-fonts-extra texlive-extra-utils dvipng
+ sudo apt install -y texlive-fonts-recommended texlive-latex-extra
+ python -m venv testenv
+ . testenv/bin/activate
+ python -m pip install --upgrade pip setuptools wheel
+ python -m pip install -r requirements-ci.txt
+ - name: analyze example session
+ timeout-minutes: 5
+ run: |
+ . testenv/bin/activate
+ ./examples/00_getting_started.py local.localhost
+ SID=$(ls -rt | grep rp.session)
+ echo "$SID: $SID"
+ radical-analytics-inspect "$SID"
+ mkdir artifacts
+ ls -la
+ cp -R *.png *.stats artifacts
+ - name: upload artifacts
+ uses: actions/upload-artifact@v3
+ with:
+ name: artifacts
+ path: artifacts
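The consolidated install step references `requirements-ci.txt`, which is not
included in this diff. A plausible sketch, assuming it simply folds together
what the removed install commands set up across the three jobs (RCT
dependencies from their devel branches, the package itself, and the test
dependencies), would be:

    # requirements-ci.txt -- hypothetical contents, reconstructed from the
    # pip install commands this diff removes
    radical.utils @ git+https://github.com/radical-cybertools/radical.utils.git@devel
    radical.saga @ git+https://github.com/radical-cybertools/radical.saga.git@devel
    radical.analytics @ git+https://github.com/radical-cybertools/radical.analytics.git@devel
    .
    -r requirements-tests.txt

With this, each job maintains its dependency list in one file instead of
repeating four install commands per job.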
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index ec443e7c37..54bfe44976 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -9,10 +9,10 @@ name: 'Test Jupyter notebooks'
on:
push:
branches:
- - docs/nb_section3
+ - 'devel*'
pull_request:
branches:
- - docs/nb_section3
+ - 'devel*'
# This allows a subsequently queued workflow run to interrupt previous runs
concurrency:
@@ -24,6 +24,7 @@ jobs:
uses: ./.github/workflows/run-rp-notebook.yml
with:
python-version: 3.7
+ requirements-file: 'requirements-docs-ci.txt'
notebook-name: 'getting_started.ipynb'
test-tutorials:
@@ -31,16 +32,18 @@ jobs:
matrix:
tutorial: [
'configuration.ipynb',
- # 'debugging.ipynb',
+ 'debugging.ipynb',
'describing_tasks.ipynb',
'multiple_pilots.ipynb',
- # 'profiling.ipynb',
- # 'raptor.ipynb',
+ 'profiling.ipynb',
+ 'raptor.ipynb',
'staging_data.ipynb',
'submission.ipynb'
]
uses: ./.github/workflows/run-rp-notebook.yml
with:
python-version: 3.7
+ requirements-file: 'requirements-docs-ci.txt'
notebook-name: ${{ matrix.tutorial }}
- notebook-path: 'tutorials'
\ No newline at end of file
+ notebook-path: 'tutorials'
+
diff --git a/.github/workflows/run-rp-notebook.yml b/.github/workflows/run-rp-notebook.yml
index de5b9a2251..57de1dab78 100644
--- a/.github/workflows/run-rp-notebook.yml
+++ b/.github/workflows/run-rp-notebook.yml
@@ -7,6 +7,11 @@ on:
description: 'Python version for running the Jupyter notebook'
required: true
type: string
+ requirements-file:
+ description: 'File with dependencies'
+ required: false
+ default: 'requirements-docs.txt'
+ type: string
notebook-name:
description: 'File name of the Jupyter notebook'
required: true
@@ -25,11 +30,6 @@ on:
jobs:
tests:
runs-on: ubuntu-latest
- services:
- mongodb:
- image: mongo
- ports:
- - 27017/tcp # will assign a random free host port
steps:
- name: Checkout repository
uses: actions/checkout@v3
@@ -41,18 +41,39 @@ jobs:
python-version: ${{ inputs.python-version }}
- name: Install dependencies
run: |
+ sudo apt update -y && sudo apt install -y mpich
python -m venv testenv
. testenv/bin/activate
- python -m pip install --upgrade pip
- python -m pip install -r requirements-docs.txt
+ python -m pip install --upgrade pip setuptools wheel
+ python -m pip install -r ${{ inputs.requirements-file }}
python -m pip install jupyter
- name: Run Jupyter Notebook
env:
- MONGODB_HOST: localhost
- MONGODB_PORT: ${{ job.services.mongodb.ports[27017] }}
- RADICAL_PILOT_DBURL: mongodb://localhost:${{ job.services.mongodb.ports[27017] }}/test
TARGET_PATH: ${{ format('{0}/{1}/{2}', inputs.documentation-path, inputs.notebook-path, inputs.notebook-name) }}
+ timeout-minutes: 5
+ # continue-on-error: true
run: |
. testenv/bin/activate
jupyter nbconvert --clear-output --inplace $TARGET_PATH
jupyter nbconvert --to notebook --execute --inplace $TARGET_PATH
+ - name: Collect session
+ if: always()
+ run: |
+ SIDCLIENT=$(ls -rt | grep rp.session)
+ SIDAGENT="$HOME/radical.pilot.sandbox/$SIDCLIENT"
+ CLIENTNAME="${{inputs.notebook-name}}_client_$SIDCLIENT"
+ AGENTNAME="${{inputs.notebook-name}}_agent_$SIDCLIENT"
+ mkdir session
+ tar cvfj $CLIENTNAME.tar.bz2 $SIDCLIENT
+ cp -R $CLIENTNAME.tar.bz2 session
+ if [ -d "$SIDAGENT" ]; then
+ tar cvfj $AGENTNAME.tar.bz2 $SIDAGENT
+ cp -R $AGENTNAME.tar.bz2 session
+ fi
+ - name: upload session
+ if: always()
+ uses: actions/upload-artifact@v3
+ with:
+ name: session
+ path: session
+ retention-days: 5
diff --git a/.readthedocs.yml b/.readthedocs.yml
index 855af75dfb..cb344a59a7 100644
--- a/.readthedocs.yml
+++ b/.readthedocs.yml
@@ -3,9 +3,14 @@ version: 2
formats: [htmlzip]
+build:
+ os: "ubuntu-22.04"
+ tools:
+ python: "3.7"
+ apt_packages:
+ - mpich
+
python:
- system_packages: true
- version: 3.7
install:
- requirements: requirements-docs.txt
- method: pip
diff --git a/TODO b/TODO
index 10770e1cc2..746d054f3c 100644
--- a/TODO
+++ b/TODO
@@ -252,9 +252,14 @@ term iv
- Exception in SA startup causes hang (agent_0 does not die)
+others
+------
- `PRTE` switches in scheduler should become `partition` switches, where the
partitions are defined by the RM
+ - stager and other RPC-like workers should get a proper async RPC channel
+ (req/res). That should be built upon `ru.zmq.Service`. The client side
+ should wrap the request into a proper async Future.
- create_named_env needs to issue a client error on failures
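The new TODO entry sketches an async RPC pattern: a req/res channel whose
client side hands each request back as a Future. A minimal, library-agnostic
sketch of that client side (names are illustrative; the real channel would be
built on `ru.zmq.Service` as the entry says):

    import itertools
    import threading

    from concurrent.futures import Future

    class AsyncRPCClient:
        '''Illustrative req/res client: maps request ids to Futures so
           callers can block on, or attach callbacks to, single responses.'''

        def __init__(self, send):
            self._send    = send           # callable shipping a msg downstream
            self._pending = {}             # request id -> Future
            self._ids     = itertools.count()
            self._lock    = threading.Lock()

        def request(self, cmd, **kwargs):
            fut = Future()
            with self._lock:
                rid = next(self._ids)
                self._pending[rid] = fut
            self._send({'id': rid, 'cmd': cmd, 'kwargs': kwargs})
            return fut                     # caller: fut.result(timeout=...)

        def _on_response(self, msg):
            # invoked by the channel's receiver thread for every response
            with self._lock:
                fut = self._pending.pop(msg['id'])
            if msg.get('err'): fut.set_exception(RuntimeError(msg['err']))
            else             : fut.set_result(msg.get('res'))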
diff --git a/VERSION b/VERSION
index ebeef2f2d6..32b7211cb6 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-1.38.0
+1.40.0
diff --git a/bin/plot_profile.ipynb b/bin/plot_profile.ipynb
index 1f12684c79..ef04d2e299 100755
--- a/bin/plot_profile.ipynb
+++ b/bin/plot_profile.ipynb
@@ -89,7 +89,7 @@
"\n",
"Number of pilots in session: 1\n",
"Processing pilot 'pilot.0000'\n",
- "fetching 'file://localhost/home/merzky/radical.pilot.sandbox/rp.session.cameo.merzky.016673.0005-pilot.0000//agent.0.AgentExecutingComponent_SHELL.0.prof' to 'file://localhost/tmp//rp.session.cameo.merzky.016673.0005/'."
+ "fetching 'file://localhost/home/merzky/radical.pilot.sandbox/rp.session.cameo.merzky.016673.0005-pilot.0000//agent_0.AgentExecutingComponent_SHELL.0.prof' to 'file://localhost/tmp//rp.session.cameo.merzky.016673.0005/'."
]
},
{
@@ -97,7 +97,7 @@
"stream": "stdout",
"text": [
"\n",
- "fetching 'file://localhost/home/merzky/radical.pilot.sandbox/rp.session.cameo.merzky.016673.0005-pilot.0000//agent.0.AgentExecutingWatcher_SHELL.0.prof' to 'file://localhost/tmp//rp.session.cameo.merzky.016673.0005/'."
+ "fetching 'file://localhost/home/merzky/radical.pilot.sandbox/rp.session.cameo.merzky.016673.0005-pilot.0000//agent_0.AgentExecutingWatcher_SHELL.0.prof' to 'file://localhost/tmp//rp.session.cameo.merzky.016673.0005/'."
]
},
{
@@ -105,7 +105,7 @@
"stream": "stdout",
"text": [
"\n",
- "fetching 'file://localhost/home/merzky/radical.pilot.sandbox/rp.session.cameo.merzky.016673.0005-pilot.0000//agent.0.AgentHeartbeatWorker.0.prof' to 'file://localhost/tmp//rp.session.cameo.merzky.016673.0005/'."
+ "fetching 'file://localhost/home/merzky/radical.pilot.sandbox/rp.session.cameo.merzky.016673.0005-pilot.0000//agent_0.AgentHeartbeatWorker.0.prof' to 'file://localhost/tmp//rp.session.cameo.merzky.016673.0005/'."
]
},
{
@@ -113,7 +113,7 @@
"stream": "stdout",
"text": [
"\n",
- "fetching 'file://localhost/home/merzky/radical.pilot.sandbox/rp.session.cameo.merzky.016673.0005-pilot.0000//agent.0.AgentStagingInputComponent.0.prof' to 'file://localhost/tmp//rp.session.cameo.merzky.016673.0005/'."
+ "fetching 'file://localhost/home/merzky/radical.pilot.sandbox/rp.session.cameo.merzky.016673.0005-pilot.0000//agent_0.AgentStagingInputComponent.0.prof' to 'file://localhost/tmp//rp.session.cameo.merzky.016673.0005/'."
]
},
{
@@ -121,7 +121,7 @@
"stream": "stdout",
"text": [
"\n",
- "fetching 'file://localhost/home/merzky/radical.pilot.sandbox/rp.session.cameo.merzky.016673.0005-pilot.0000//agent.0.AgentStagingOutputComponent.0.prof' to 'file://localhost/tmp//rp.session.cameo.merzky.016673.0005/'."
+ "fetching 'file://localhost/home/merzky/radical.pilot.sandbox/rp.session.cameo.merzky.016673.0005-pilot.0000//agent_0.AgentStagingOutputComponent.0.prof' to 'file://localhost/tmp//rp.session.cameo.merzky.016673.0005/'."
]
},
{
@@ -129,7 +129,7 @@
"stream": "stdout",
"text": [
"\n",
- "fetching 'file://localhost/home/merzky/radical.pilot.sandbox/rp.session.cameo.merzky.016673.0005-pilot.0000//agent.0.AgentUpdateWorker.0.prof' to 'file://localhost/tmp//rp.session.cameo.merzky.016673.0005/'."
+ "fetching 'file://localhost/home/merzky/radical.pilot.sandbox/rp.session.cameo.merzky.016673.0005-pilot.0000//agent_0.AgentUpdateWorker.0.prof' to 'file://localhost/tmp//rp.session.cameo.merzky.016673.0005/'."
]
},
{
@@ -137,7 +137,7 @@
"stream": "stdout",
"text": [
"\n",
- "fetching 'file://localhost/home/merzky/radical.pilot.sandbox/rp.session.cameo.merzky.016673.0005-pilot.0000//agent.0.AgentWorker.0.prof' to 'file://localhost/tmp//rp.session.cameo.merzky.016673.0005/'."
+ "fetching 'file://localhost/home/merzky/radical.pilot.sandbox/rp.session.cameo.merzky.016673.0005-pilot.0000//agent_0.AgentWorker.0.prof' to 'file://localhost/tmp//rp.session.cameo.merzky.016673.0005/'."
]
},
{
@@ -145,7 +145,7 @@
"stream": "stdout",
"text": [
"\n",
- "fetching 'file://localhost/home/merzky/radical.pilot.sandbox/rp.session.cameo.merzky.016673.0005-pilot.0000//agent.0.SchedulerContinuous.0.prof' to 'file://localhost/tmp//rp.session.cameo.merzky.016673.0005/'."
+ "fetching 'file://localhost/home/merzky/radical.pilot.sandbox/rp.session.cameo.merzky.016673.0005-pilot.0000//agent_0.SchedulerContinuous.0.prof' to 'file://localhost/tmp//rp.session.cameo.merzky.016673.0005/'."
]
},
{
@@ -552,15 +552,15 @@
"168 13.0622 New PendingInputStaging tmgr_adv_u_pend rp.session.cameo.merzky.016673.0005:MainThread\n",
"169 13.2733 PendingInputStaging StagingInput usic_get_u rp.session.cameo.merzky.016673.0005:InputFileT...\n",
"175 13.6307 StagingInput AgentStagingInputPending usic_adv_u_pend rp.session.cameo.merzky.016673.0005:InputFileT...\n",
- "209 16.0061 AgentStagingInputPending AgentStagingInputPending awo_adv_u_pend agent.0.AgentWorker.0:MainThread\n",
- "293 16.0258 AgentStagingInputPending AgentStagingInput asic_adv_u agent.0.AgentStagingInputComponent.0:MainThread\n",
- "297 16.0261 AgentStagingInput AllocatingPending asic_adv_u_pend agent.0.AgentStagingInputComponent.0:MainThread\n",
- "368 16.0342 AllocatingPending Allocating asc_adv_u agent.0.SchedulerContinuous.0:MainThread\n",
- "374 16.0347 Allocating ExecutingPending asc_adv_u_pend agent.0.SchedulerContinuous.0:MainThread\n",
- "553 16.5038 ExecutingPending Executing aec_adv_u agent.0.AgentExecutingWatcher_SHELL.0:MainThread\n",
- "693 16.6241 Executing AgentStagingOutputPending aec_adv_u_pend agent.0.AgentExecutingWatcher_SHELL.0:Watcher\n",
- "706 16.6323 AgentStagingOutputPending AgentStagingOutput asoc_adv_u agent.0.AgentStagingOutputComponent.0:MainThread\n",
- "708 16.6326 AgentStagingOutput PendingOutputStaging asoc_adv_u_pend agent.0.AgentStagingOutputComponent.0:MainThread\n",
+ "209 16.0061 AgentStagingInputPending AgentStagingInputPending awo_adv_u_pend agent_0.AgentWorker.0:MainThread\n",
+ "293 16.0258 AgentStagingInputPending AgentStagingInput asic_adv_u agent_0.AgentStagingInputComponent.0:MainThread\n",
+ "297 16.0261 AgentStagingInput AllocatingPending asic_adv_u_pend agent_0.AgentStagingInputComponent.0:MainThread\n",
+ "368 16.0342 AllocatingPending Allocating asc_adv_u agent_0.SchedulerContinuous.0:MainThread\n",
+ "374 16.0347 Allocating ExecutingPending asc_adv_u_pend agent_0.SchedulerContinuous.0:MainThread\n",
+ "553 16.5038 ExecutingPending Executing aec_adv_u agent_0.AgentExecutingWatcher_SHELL.0:MainThread\n",
+ "693 16.6241 Executing AgentStagingOutputPending aec_adv_u_pend agent_0.AgentExecutingWatcher_SHELL.0:Watcher\n",
+ "706 16.6323 AgentStagingOutputPending AgentStagingOutput asoc_adv_u agent_0.AgentStagingOutputComponent.0:MainThread\n",
+ "708 16.6326 AgentStagingOutput PendingOutputStaging asoc_adv_u_pend agent_0.AgentStagingOutputComponent.0:MainThread\n",
"1553 19.0440 PendingOutputStaging StagingOutput usoc_get_u rp.session.cameo.merzky.016673.0005:OutputFile...\n",
"1688 19.2452 StagingOutput Done usoc_adv_u rp.session.cameo.merzky.016673.0005:OutputFile...\n"
]
[@@ -589 .. -836 @@: twenty single-line hunks in the HTML-rendered dataframe
output, each renaming an 'agent.0.*' cell value to 'agent_0.*'; the same rows
appear in the plain-text output below]
@@ -853,26 +853,26 @@
"prompt_number": 45,
"text": [
" event msg name state time uid entity cloned state_from info\n",
- "2167 advance agent.0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 20.5973 task.000070 task False ExecutingPending aec_adv_u_pend\n",
- "2186 advance agent.0.AgentExecutingWatcher_SHELL.0:MainThread Executing 20.7117 task.000069 task False ExecutingPending aec_adv_u\n",
- "2193 advance agent.0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 20.7125 task.000010 task False Executing aec_adv_u_pend\n",
- "2215 advance agent.0.AgentExecutingWatcher_SHELL.0:MainThread Executing 20.8313 task.000009 task False ExecutingPending aec_adv_u\n",
- "2222 advance agent.0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 20.8325 task.000069 task False Executing aec_adv_u_pend\n",
- "2362 advance agent.0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 20.9513 task.000009 task False Executing aec_adv_u_pend\n",
- "2398 advance agent.0.AgentExecutingWatcher_SHELL.0:MainThread Executing 21.0580 task.000071 task False AgentStagingInputPending aec_adv_u\n",
- "2470 advance agent.0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 21.1808 task.000071 task False ExecutingPending aec_adv_u_pend\n",
- "2475 advance agent.0.AgentExecutingWatcher_SHELL.0:MainThread Executing 21.1822 task.000011 task False ExecutingPending aec_adv_u\n",
- "2500 advance agent.0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 21.2993 task.000011 task False Executing aec_adv_u_pend\n",
- "2538 advance agent.0.AgentExecutingWatcher_SHELL.0:MainThread Executing 21.7797 task.000088 task False AgentStagingInputPending aec_adv_u\n",
- "2632 advance agent.0.AgentExecutingWatcher_SHELL.0:MainThread Executing 21.8973 task.000072 task False ExecutingPending aec_adv_u\n",
- "2638 advance agent.0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 21.8984 task.000088 task False ExecutingPending aec_adv_u_pend\n",
- "2769 advance agent.0.AgentExecutingWatcher_SHELL.0:MainThread Executing 22.0198 task.000089 task False ExecutingPending aec_adv_u\n",
- "2776 advance agent.0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 22.0206 task.000072 task False Executing aec_adv_u_pend\n",
- "2802 advance agent.0.AgentExecutingWatcher_SHELL.0:MainThread Executing 22.1400 task.000012 task False ExecutingPending aec_adv_u\n",
- "2809 advance agent.0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 22.1415 task.000089 task False Executing aec_adv_u_pend\n",
- "2840 advance agent.0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 22.2618 task.000012 task False Executing aec_adv_u_pend\n",
- "2871 advance agent.0.AgentExecutingWatcher_SHELL.0:MainThread Executing 22.8924 task.000020 task False AgentStagingInputPending aec_adv_u\n",
- "2922 advance agent.0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 23.0136 task.000020 task False ExecutingPending aec_adv_u_pend"
+ "2167 advance agent_0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 20.5973 task.000070 task False ExecutingPending aec_adv_u_pend\n",
+ "2186 advance agent_0.AgentExecutingWatcher_SHELL.0:MainThread Executing 20.7117 task.000069 task False ExecutingPending aec_adv_u\n",
+ "2193 advance agent_0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 20.7125 task.000010 task False Executing aec_adv_u_pend\n",
+ "2215 advance agent_0.AgentExecutingWatcher_SHELL.0:MainThread Executing 20.8313 task.000009 task False ExecutingPending aec_adv_u\n",
+ "2222 advance agent_0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 20.8325 task.000069 task False Executing aec_adv_u_pend\n",
+ "2362 advance agent_0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 20.9513 task.000009 task False Executing aec_adv_u_pend\n",
+ "2398 advance agent_0.AgentExecutingWatcher_SHELL.0:MainThread Executing 21.0580 task.000071 task False AgentStagingInputPending aec_adv_u\n",
+ "2470 advance agent_0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 21.1808 task.000071 task False ExecutingPending aec_adv_u_pend\n",
+ "2475 advance agent_0.AgentExecutingWatcher_SHELL.0:MainThread Executing 21.1822 task.000011 task False ExecutingPending aec_adv_u\n",
+ "2500 advance agent_0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 21.2993 task.000011 task False Executing aec_adv_u_pend\n",
+ "2538 advance agent_0.AgentExecutingWatcher_SHELL.0:MainThread Executing 21.7797 task.000088 task False AgentStagingInputPending aec_adv_u\n",
+ "2632 advance agent_0.AgentExecutingWatcher_SHELL.0:MainThread Executing 21.8973 task.000072 task False ExecutingPending aec_adv_u\n",
+ "2638 advance agent_0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 21.8984 task.000088 task False ExecutingPending aec_adv_u_pend\n",
+ "2769 advance agent_0.AgentExecutingWatcher_SHELL.0:MainThread Executing 22.0198 task.000089 task False ExecutingPending aec_adv_u\n",
+ "2776 advance agent_0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 22.0206 task.000072 task False Executing aec_adv_u_pend\n",
+ "2802 advance agent_0.AgentExecutingWatcher_SHELL.0:MainThread Executing 22.1400 task.000012 task False ExecutingPending aec_adv_u\n",
+ "2809 advance agent_0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 22.1415 task.000089 task False Executing aec_adv_u_pend\n",
+ "2840 advance agent_0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 22.2618 task.000012 task False Executing aec_adv_u_pend\n",
+ "2871 advance agent_0.AgentExecutingWatcher_SHELL.0:MainThread Executing 22.8924 task.000020 task False AgentStagingInputPending aec_adv_u\n",
+ "2922 advance agent_0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 23.0136 task.000020 task False ExecutingPending aec_adv_u_pend"
]
}
],
[@@ -913 .. -1030 @@: ten single-line hunks in the HTML-rendered dataframe
output, each renaming an 'agent.0.*' cell value to 'agent_0.*'; the same rows
appear in the plain-text output below]
@@ -1047,16 +1047,16 @@
"prompt_number": 48,
"text": [
" event msg name state time uid entity cloned state_from info\n",
- "2167 advance agent.0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 20.5973 task.000070 task False ExecutingPending aec_adv_u_pend\n",
- "2186 advance agent.0.AgentExecutingWatcher_SHELL.0:MainThread Executing 20.7117 task.000069 task False ExecutingPending aec_adv_u\n",
- "2193 advance agent.0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 20.7125 task.000010 task False Executing aec_adv_u_pend\n",
- "2215 advance agent.0.AgentExecutingWatcher_SHELL.0:MainThread Executing 20.8313 task.000009 task False ExecutingPending aec_adv_u\n",
- "2222 advance agent.0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 20.8325 task.000069 task False Executing aec_adv_u_pend\n",
- "2362 advance agent.0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 20.9513 task.000009 task False Executing aec_adv_u_pend\n",
- "2398 advance agent.0.AgentExecutingWatcher_SHELL.0:MainThread Executing 21.0580 task.000071 task False AgentStagingInputPending aec_adv_u\n",
- "2470 advance agent.0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 21.1808 task.000071 task False ExecutingPending aec_adv_u_pend\n",
- "2475 advance agent.0.AgentExecutingWatcher_SHELL.0:MainThread Executing 21.1822 task.000011 task False ExecutingPending aec_adv_u\n",
- "2500 advance agent.0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 21.2993 task.000011 task False Executing aec_adv_u_pend"
+ "2167 advance agent_0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 20.5973 task.000070 task False ExecutingPending aec_adv_u_pend\n",
+ "2186 advance agent_0.AgentExecutingWatcher_SHELL.0:MainThread Executing 20.7117 task.000069 task False ExecutingPending aec_adv_u\n",
+ "2193 advance agent_0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 20.7125 task.000010 task False Executing aec_adv_u_pend\n",
+ "2215 advance agent_0.AgentExecutingWatcher_SHELL.0:MainThread Executing 20.8313 task.000009 task False ExecutingPending aec_adv_u\n",
+ "2222 advance agent_0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 20.8325 task.000069 task False Executing aec_adv_u_pend\n",
+ "2362 advance agent_0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 20.9513 task.000009 task False Executing aec_adv_u_pend\n",
+ "2398 advance agent_0.AgentExecutingWatcher_SHELL.0:MainThread Executing 21.0580 task.000071 task False AgentStagingInputPending aec_adv_u\n",
+ "2470 advance agent_0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 21.1808 task.000071 task False ExecutingPending aec_adv_u_pend\n",
+ "2475 advance agent_0.AgentExecutingWatcher_SHELL.0:MainThread Executing 21.1822 task.000011 task False ExecutingPending aec_adv_u\n",
+ "2500 advance agent_0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 21.2993 task.000011 task False Executing aec_adv_u_pend"
]
}
],
[@@ -1113 .. -1379 @@: twenty single-line hunks in the HTML-rendered dataframe
output, each renaming an 'agent.0.*' cell value to 'agent_0.*'; the same rows
appear in the plain-text output below]
@@ -1397,26 +1397,26 @@
"prompt_number": 52,
"text": [
" event msg name state time uid entity cloned state_from info cc_exe\n",
- "2167 advance agent.0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 20.5973 task.000070 task False ExecutingPending aec_adv_u_pend 1\n",
- "2186 advance agent.0.AgentExecutingWatcher_SHELL.0:MainThread Executing 20.7117 task.000069 task False ExecutingPending aec_adv_u 2\n",
- "2193 advance agent.0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 20.7125 task.000010 task False Executing aec_adv_u_pend 1\n",
- "2215 advance agent.0.AgentExecutingWatcher_SHELL.0:MainThread Executing 20.8313 task.000009 task False ExecutingPending aec_adv_u 2\n",
- "2222 advance agent.0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 20.8325 task.000069 task False Executing aec_adv_u_pend 1\n",
- "2362 advance agent.0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 20.9513 task.000009 task False Executing aec_adv_u_pend 0\n",
- "2398 advance agent.0.AgentExecutingWatcher_SHELL.0:MainThread Executing 21.0580 task.000071 task False AgentStagingInputPending aec_adv_u 1\n",
- "2470 advance agent.0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 21.1808 task.000071 task False ExecutingPending aec_adv_u_pend 0\n",
- "2475 advance agent.0.AgentExecutingWatcher_SHELL.0:MainThread Executing 21.1822 task.000011 task False ExecutingPending aec_adv_u 1\n",
- "2500 advance agent.0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 21.2993 task.000011 task False Executing aec_adv_u_pend 0\n",
- "2538 advance agent.0.AgentExecutingWatcher_SHELL.0:MainThread Executing 21.7797 task.000088 task False AgentStagingInputPending aec_adv_u 1\n",
- "2632 advance agent.0.AgentExecutingWatcher_SHELL.0:MainThread Executing 21.8973 task.000072 task False ExecutingPending aec_adv_u 2\n",
- "2638 advance agent.0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 21.8984 task.000088 task False ExecutingPending aec_adv_u_pend 1\n",
- "2769 advance agent.0.AgentExecutingWatcher_SHELL.0:MainThread Executing 22.0198 task.000089 task False ExecutingPending aec_adv_u 2\n",
- "2776 advance agent.0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 22.0206 task.000072 task False Executing aec_adv_u_pend 1\n",
- "2802 advance agent.0.AgentExecutingWatcher_SHELL.0:MainThread Executing 22.1400 task.000012 task False ExecutingPending aec_adv_u 2\n",
- "2809 advance agent.0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 22.1415 task.000089 task False Executing aec_adv_u_pend 1\n",
- "2840 advance agent.0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 22.2618 task.000012 task False Executing aec_adv_u_pend 0\n",
- "2871 advance agent.0.AgentExecutingWatcher_SHELL.0:MainThread Executing 22.8924 task.000020 task False AgentStagingInputPending aec_adv_u 1\n",
- "2922 advance agent.0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 23.0136 task.000020 task False ExecutingPending aec_adv_u_pend 0"
+ "2167 advance agent_0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 20.5973 task.000070 task False ExecutingPending aec_adv_u_pend 1\n",
+ "2186 advance agent_0.AgentExecutingWatcher_SHELL.0:MainThread Executing 20.7117 task.000069 task False ExecutingPending aec_adv_u 2\n",
+ "2193 advance agent_0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 20.7125 task.000010 task False Executing aec_adv_u_pend 1\n",
+ "2215 advance agent_0.AgentExecutingWatcher_SHELL.0:MainThread Executing 20.8313 task.000009 task False ExecutingPending aec_adv_u 2\n",
+ "2222 advance agent_0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 20.8325 task.000069 task False Executing aec_adv_u_pend 1\n",
+ "2362 advance agent_0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 20.9513 task.000009 task False Executing aec_adv_u_pend 0\n",
+ "2398 advance agent_0.AgentExecutingWatcher_SHELL.0:MainThread Executing 21.0580 task.000071 task False AgentStagingInputPending aec_adv_u 1\n",
+ "2470 advance agent_0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 21.1808 task.000071 task False ExecutingPending aec_adv_u_pend 0\n",
+ "2475 advance agent_0.AgentExecutingWatcher_SHELL.0:MainThread Executing 21.1822 task.000011 task False ExecutingPending aec_adv_u 1\n",
+ "2500 advance agent_0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 21.2993 task.000011 task False Executing aec_adv_u_pend 0\n",
+ "2538 advance agent_0.AgentExecutingWatcher_SHELL.0:MainThread Executing 21.7797 task.000088 task False AgentStagingInputPending aec_adv_u 1\n",
+ "2632 advance agent_0.AgentExecutingWatcher_SHELL.0:MainThread Executing 21.8973 task.000072 task False ExecutingPending aec_adv_u 2\n",
+ "2638 advance agent_0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 21.8984 task.000088 task False ExecutingPending aec_adv_u_pend 1\n",
+ "2769 advance agent_0.AgentExecutingWatcher_SHELL.0:MainThread Executing 22.0198 task.000089 task False ExecutingPending aec_adv_u 2\n",
+ "2776 advance agent_0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 22.0206 task.000072 task False Executing aec_adv_u_pend 1\n",
+ "2802 advance agent_0.AgentExecutingWatcher_SHELL.0:MainThread Executing 22.1400 task.000012 task False ExecutingPending aec_adv_u 2\n",
+ "2809 advance agent_0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 22.1415 task.000089 task False Executing aec_adv_u_pend 1\n",
+ "2840 advance agent_0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 22.2618 task.000012 task False Executing aec_adv_u_pend 0\n",
+ "2871 advance agent_0.AgentExecutingWatcher_SHELL.0:MainThread Executing 22.8924 task.000020 task False AgentStagingInputPending aec_adv_u 1\n",
+ "2922 advance agent_0.AgentExecutingWatcher_SHELL.0:Watcher AgentStagingOutputPending 23.0136 task.000020 task False ExecutingPending aec_adv_u_pend 0"
]
}
],
diff --git a/bin/radical-pilot-agent b/bin/radical-pilot-agent
deleted file mode 100755
index 33cec7c9a1..0000000000
--- a/bin/radical-pilot-agent
+++ /dev/null
@@ -1,87 +0,0 @@
-#!/usr/bin/env python3
-
-__copyright__ = "Copyright 2014-2016, http://radical.rutgers.edu"
-__license__ = "MIT"
-
-
-import os
-import sys
-import time
-import setproctitle
-
-import radical.utils as ru
-import radical.pilot as rp
-
-
-# ------------------------------------------------------------------------------
-#
-def bootstrap_3(aid):
- """
- This method continues where the bootstrap_0/1/2 left off, and will now pass
- control to the Agent class which will spawn the functional components.
- Before doing so, we will check if we happen to be agent instance zero. If
- that is the case, some additional python level bootstrap routines kick in,
- to set the stage for component and sub-agent spawning.
-
- The agent interprets a config file, which will specify in an 'agents'
- section:
- - what nodes should be used for sub-agent startup
- - what bridges should be started
- - what are the endpoints for bridges which are not started
- - what components should be started
- agent.0 will create derived config files for all sub-agents.
- """
-
- print("bootstrap agent %s" % aid)
-
- agent = None
-
- try:
- setproctitle.setproctitle('rp.%s' % aid)
-
- cfg = ru.Config(path='%s.cfg' % aid)
- cfg.uid = aid
- cfg.aid = aid # used by executor
- cfg.path = os.getcwd()
- cfg.base = os.getcwd()
-
- # start a non-primary session (the agents will own their cmgrs)
- session = rp.Session(cfg=cfg, _primary=False)
-
- if aid == 'agent.0': agent = rp.Agent_0(cfg, session)
- else : agent = rp.Agent_n(cfg, session)
-
- agent.start()
-
- # wait until the agent finishes or fails.
- while True:
- time.sleep(0.1)
-
- except:
- print('failed %s' % aid)
- ru.print_exception_trace()
-
- finally:
- # in all cases, make sure we perform an orderly shutdown. I hope python
- # does not mind doing all those things in a finally clause of
- # (essentially) main...
- print('finalize %s' % aid)
-
- if agent:
- agent.stop()
- print('stopped %s' % aid)
-
-
-# ------------------------------------------------------------------------------
-#
-if __name__ == "__main__":
-
- # FIXME: daemonization a'la component
-
- if len(sys.argv) != 2:
- raise RuntimeError('missing parameter: agent id')
-
- bootstrap_3(sys.argv[1])
-
-
-# ------------------------------------------------------------------------------
diff --git a/bin/radical-pilot-agent_0 b/bin/radical-pilot-agent_0
new file mode 100755
index 0000000000..ed2d5e2286
--- /dev/null
+++ b/bin/radical-pilot-agent_0
@@ -0,0 +1,19 @@
+#!/usr/bin/env python3
+
+__copyright__ = "Copyright 2014-2019, http://radical.rutgers.edu"
+__license__ = "MIT"
+
+import radical.pilot as rp
+
+
+# ------------------------------------------------------------------------------
+#
+if __name__ == "__main__":
+
+ agent = rp.Agent_0()
+ agent.start()
+ agent.wait()
+
+
+# ------------------------------------------------------------------------------
+
diff --git a/bin/radical-pilot-agent_n b/bin/radical-pilot-agent_n
new file mode 100755
index 0000000000..ba3c8b6502
--- /dev/null
+++ b/bin/radical-pilot-agent_n
@@ -0,0 +1,146 @@
+#!/usr/bin/env python3
+
+__copyright__ = "Copyright 2014-2019, http://radical.rutgers.edu"
+__license__ = "MIT"
+
+
+import os
+import sys
+import time
+
+import threading as mt
+import setproctitle as spt
+
+import radical.utils as ru
+import radical.pilot as rp
+
+from radical.pilot.messages import HeartbeatMessage
+
+# ------------------------------------------------------------------------------
+#
+def main(sid, reg_addr, uid):
+ '''
+ This method continues where the bootstrap_0/1/2 left off, and will now pass
+ control to the Agent class which will spawn the functional components.
+ Before doing so, we will check if we happen to be agent instance zero. If
+ that is the case, some additional python level bootstrap routines kick in,
+ to set the stage for component and sub-agent spawning.
+
+ The agent interprets a config file, which will specify in an 'agents'
+ section:
+ - what nodes should be used for sub-agent startup
+ - what bridges should be started
+ - what are the endpoints for bridges which are not started
+ - what components should be started
+ agent_0 will create derived config files for all sub-agents.
+ '''
+
+ # basic setup: logger and profiler
+ log = ru.Logger(name=uid, ns='radical.pilot', path=os.getcwd())
+ prof = ru.Profiler(name=uid, ns='radical.pilot', path=os.getcwd())
+
+ try:
+ prof.prof('comp_start', uid=uid)
+ prof.disable()
+ wrapped_main(sid, reg_addr, uid, log, prof)
+
+ finally:
+ prof.enable()
+ prof.prof('comp_stop', uid=uid)
+
+
+# ------------------------------------------------------------------------------
+#
+def wrapped_main(sid, reg_addr, uid, log, prof):
+
+ spt.setproctitle('rp.%s' % uid)
+
+ term = mt.Event()
+ reg = ru.zmq.RegistryClient(url=reg_addr)
+
+ hb_cfg = ru.TypedDict(reg['heartbeat'])
+ cfg = ru.TypedDict(reg['cfg'])
+
+ reg.close()
+
+ agent = run_agent_n(sid, reg_addr, uid, log, prof)
+
+ agent.start()
+
+ # agent runs - send heartbeats so that session knows about it
+ hb_pub = ru.zmq.Publisher('heartbeat', hb_cfg.addr_pub, log=log, prof=prof)
+
+ def hb_beat_cb():
+ hb_pub.put('heartbeat', HeartbeatMessage(uid=uid))
+
+ def hb_term_cb(hb_uid):
+ agent.stop()
+ term.set()
+ return False
+
+ hb = ru.Heartbeat(uid=cfg.uid,
+ timeout=hb_cfg.timeout,
+ interval=hb_cfg.interval,
+ beat_cb=hb_beat_cb,
+ term_cb=hb_term_cb,
+ log=log)
+ hb.start()
+
+ # always watch out for session heartbeat
+ hb.watch(uid=sid)
+
+ # react on session heartbeats
+ def hb_sub_cb(topic, msg):
+ hb_msg = HeartbeatMessage(from_dict=msg)
+ if hb_msg.uid == sid:
+ hb.beat(uid=sid)
+
+ ru.zmq.Subscriber('heartbeat', hb_cfg.addr_sub,
+ topic='heartbeat', cb=hb_sub_cb,
+ log=log, prof=prof)
+
+ # all is set up - we can sit idle 'til end of time.
+ while not term.is_set():
+ time.sleep(1)
+
+
+# ------------------------------------------------------------------------------
+#
+def run_agent_n(sid, reg_addr, uid, log, prof):
+
+ reg = ru.zmq.RegistryClient(url=reg_addr)
+
+ hb_cfg = ru.Config(cfg=reg['heartbeat'])
+ s_cfg = ru.Config(cfg=reg['cfg'])
+ a_cfg = ru.Config(cfg=reg['agent.%s.cfg' % uid])
+
+ reg.close()
+
+ session = rp.Session(uid=sid, cfg=s_cfg,
+ _role=rp.Session._AGENT_N, _reg_addr=reg_addr)
+ agent = rp.Agent_n(a_cfg, session)
+
+ return agent
+
+
+# ------------------------------------------------------------------------------
+#
+if __name__ == "__main__":
+
+ if len(sys.argv) != 4:
+ sys.stderr.write('error: invalid arguments\n'
+ 'usage: %s <sid> <reg_addr> <uid>\n' % sys.argv[0])
+ raise RuntimeError('invalid arguments: %s' % sys.argv)
+
+ sid = sys.argv[1]
+ reg_addr = sys.argv[2]
+ uid = sys.argv[3]
+
+ ru.daemonize(main=main, args=[sid, reg_addr, uid],
+ stdout='%s.out' % uid, stderr='%s.err' % uid)
+ sys.exit(0)
+
+
+# ------------------------------------------------------------------------------
+
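The heartbeat wiring above is symmetric: the agent publishes its own beats and
watches for the session's. Since the channel endpoints live in the session
registry, any process can observe liveness. A minimal observer sketch under
that assumption (a hypothetical tool, not part of this diff):

    #!/usr/bin/env python3

    import sys
    import time

    import radical.utils as ru

    from radical.pilot.messages import HeartbeatMessage

    # hypothetical liveness monitor: resolve the heartbeat channel from the
    # session registry, then print the uid of every component that beats
    reg_addr = sys.argv[1]

    reg    = ru.zmq.RegistryClient(url=reg_addr)
    hb_cfg = ru.TypedDict(reg['heartbeat'])
    reg.close()

    def hb_cb(topic, msg):
        print('beat: %s' % HeartbeatMessage(from_dict=msg).uid)

    ru.zmq.Subscriber('heartbeat', hb_cfg.addr_sub, topic='heartbeat', cb=hb_cb)

    while True:
        time.sleep(1)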
diff --git a/bin/radical-pilot-bridge b/bin/radical-pilot-bridge
index b3b6fa63fe..3623b2d078 100755
--- a/bin/radical-pilot-bridge
+++ b/bin/radical-pilot-bridge
@@ -4,6 +4,7 @@ __copyright__ = "Copyright 2014-2019, http://radical.rutgers.edu"
__license__ = "MIT"
+import os
import sys
import time
@@ -12,10 +13,12 @@ import setproctitle as spt
import radical.utils as ru
+from radical.pilot.messages import HeartbeatMessage
+
# ------------------------------------------------------------------------------
#
-def main(cfg):
+def main(sid, reg_addr, uid):
'''
This thin wrapper starts a ZMQ bridge. It expects a single argument:
a config to use for the bridge's configuration. The config must contain:
@@ -27,14 +30,10 @@ def main(cfg):
If the config contains a `heartbeat` section, that section must be formatted
as follows:
- RCT comm bridges can be monitored via heartbeats (using a bridge-less pubsub
- channel). To enable that monitoring, the config should contains
- a `heartbeat` section, that section must be formatted as follows:
-
{
'from' : 'uid',
- 'pub' : 'addr_pub',
- 'sub' : 'addr_sub',
+ 'addr_pub': 'addr_pub',
+ 'addr_sub': 'addr_sub',
'interval': <float>,
'timeout' : <float>
}
@@ -53,92 +52,103 @@ def main(cfg):
The config file may contain other entries which are passed to the bridge
and are interpreted by the bridge implementation.
- After startup, the bridge's communication endpoint URLs are stored in a
- file `$uid.cfg`, in the form (shown for pubsub and queue type bridges):
+ After startup, the bridge's communication endpoint URLs are stored in the
+ session's registry under `bridges.<uid>`, in the form (shown for
+ pubsub and queue type bridges):
{
- 'uid': '$bridge.uid',
- 'pub': '$addr_pub',
- 'sub': '$addr_sub'
+ 'addr_pub': '$addr_pub',
+ 'addr_sub': '$addr_sub'
}
{
- 'uid': '$bridge.uid',
- 'put': '$addr_put',
- 'get': '$addr_get'
+ 'addr_put': '$addr_put',
+ 'addr_get': '$addr_get'
}
That config is formed so that any publishers, subscribers, putters or getters
can obtain the respective bridge addresses automatically. This also holds
for command line tools like:
- > radical-pilot-bridge command.cfg [1]
- > radical-pilot-sub command foo & [2]
- > radical-pilot-pub command foo bar [3]
+ > radical-pilot-bridge sid reg_addr test_pubsub.0000 [1]
+ > radical-pilot-sub sid reg_addr test_pubsub.0000 foo & [2]
+ > radical-pilot-pub sid reg_addr test_pubsub.0000 foo bar [3]
[1] establishes the pubsub channel 'command'
[2] connect to the command channel, subscribe for topic `foo`
[3] connect to the command channel, send messages for topic `foo`
'''
- # basic setup: cfg, logger and profiler
- log = ru.Logger(name=cfg.uid, ns='radical.pilot', path=cfg.path)
- prof = ru.Profiler(name=cfg.uid, ns='radical.pilot', path=cfg.path)
+ # basic setup: logger and profiler
+ log = ru.Logger(name=uid, ns='radical.pilot', path=os.getcwd())
+ prof = ru.Profiler(name=uid, ns='radical.pilot', path=os.getcwd())
try:
- prof.prof('bridge_start', uid=cfg.uid)
+ prof.prof('bridge_start', uid=uid)
prof.disable()
- wrapped_main(cfg, log, prof)
+ wrapped_main(sid, reg_addr, uid, log, prof)
+
finally:
prof.enable()
- prof.prof('bridge_stop', uid=cfg.uid)
+ prof.prof('bridge_stop', uid=uid)
+
+# ------------------------------------------------------------------------------
+#
+def wrapped_main(sid, reg_addr, uid, log, prof):
-def wrapped_main(cfg, log, prof):
+ sys.stdout = ru.ru_open('/tmp/%s.out' % uid, 'w')
+ sys.stderr = ru.ru_open('/tmp/%s.err' % uid, 'w')
+
+ spt.setproctitle('rp.%s' % uid)
term = mt.Event()
+ reg = ru.zmq.RegistryClient(url=reg_addr)
- spt.setproctitle('rp.%s' % cfg.uid)
+ hb_cfg = ru.TypedDict(reg['heartbeat'])
+ b_cfg = ru.TypedDict(reg['bridges.%s.cfg' % uid])
- # create the bridge, store connection addresses in FS, and begin to work
- bridge = ru.zmq.Bridge.create(cfg)
+ # create the instance and begin to work
+ bridge = ru.zmq.Bridge.create(uid, cfg=b_cfg)
- ru.write_json('%s/%s.cfg' % (cfg.path, cfg.uid),
- {'uid' : cfg.uid,
- bridge.type_in : str(bridge.addr_in),
- bridge.type_out: str(bridge.addr_out)})
+ reg['bridges.%s.addr_%s' % (uid, bridge.type_in )] = str(bridge.addr_in)
+ reg['bridges.%s.addr_%s' % (uid, bridge.type_out)] = str(bridge.addr_out)
+ reg.close()
bridge.start()
+ if 'pubsub' in uid:
+ d = ru.zmq.test_pubsub(bridge.channel, bridge.addr_pub, bridge.addr_sub)
+
# bridge runs - send heartbeats so that cmgr knows about it
- hb_pub = ru.zmq.Publisher('heartbeat', cfg.heartbeat.addr_pub,
- log=log, prof=prof)
+ hb_pub = ru.zmq.Publisher('heartbeat', hb_cfg.addr_pub, log=log, prof=prof)
def hb_beat_cb():
- hb_pub.put('heartbeat', msg={'uid': cfg.uid})
+ hb_pub.put('heartbeat', HeartbeatMessage(uid=uid))
def hb_term_cb(hb_uid):
bridge.stop()
term.set()
- return None
+ return False
- hb = ru.Heartbeat(uid=cfg.uid,
- timeout=cfg.heartbeat.timeout,
- interval=cfg.heartbeat.interval,
+ hb = ru.Heartbeat(uid=uid,
+ timeout=hb_cfg.timeout,
+ interval=hb_cfg.interval,
beat_cb=hb_beat_cb,
term_cb=hb_term_cb,
log=log)
hb.start()
- # register cmgr heartbeat by beating once
- hb.beat(uid=cfg.cmgr)
+ # always watch out for session heartbeat
+ hb.watch(uid=sid)
- # record cmgr heartbeats
+ # react on session heartbeats
def hb_sub_cb(topic, msg):
- if msg['uid'] == cfg.cmgr:
- hb.beat(uid=cfg.cmgr)
+ hb_msg = HeartbeatMessage(from_dict=msg)
+ if hb_msg.uid == sid:
+ hb.beat(uid=sid)
- ru.zmq.Subscriber('heartbeat', cfg.heartbeat.addr_sub,
+ ru.zmq.Subscriber('heartbeat', hb_cfg.addr_sub,
topic='heartbeat', cb=hb_sub_cb,
log=log, prof=prof)
@@ -151,17 +161,17 @@ def wrapped_main(cfg, log, prof):
#
if __name__ == "__main__":
- if len(sys.argv) != 2:
+ if len(sys.argv) != 4:
sys.stderr.write('error: invalid arguments\n'
- 'usage: %s <cfg_file>\n' % sys.argv[0])
+ 'usage: %s <sid> <reg_addr> <uid>\n' % sys.argv[0])
raise RuntimeError('invalid arguments: %s' % sys.argv)
- fname = sys.argv[1]
- cfg = ru.Config(path=fname)
- path = '%s/%s' % (cfg.path, cfg.uid)
+ sid = sys.argv[1]
+ reg_addr = sys.argv[2]
+ uid = sys.argv[3]
- ru.daemonize(main=main, args=[cfg], stdout='%s.out' % path,
- stderr='%s.err' % path)
+ ru.daemonize(main=main, args=[sid, reg_addr, uid],
+ stdout='%s.out' % uid, stderr='%s.err' % uid)
sys.exit(0)
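
A minimal sketch of the consumer side of this registry contract, assuming the
`bridges.<uid>.addr_*` keys written by wrapped_main() above; the bridge uid
`test_pubsub.0000`, the channel name, and the callback are illustrative only:

    import radical.utils as ru

    def subscribe(reg_addr, bridge_uid, topic, cb):
        # look up the subscriber address the bridge stored in the registry
        reg  = ru.zmq.RegistryClient(url=reg_addr)
        addr = reg['bridges.%s.addr_sub' % bridge_uid]
        reg.close()
        # connect to the bridge and register the message callback
        return ru.zmq.Subscriber(bridge_uid, addr, topic=topic, cb=cb)

    # usage (illustrative): print every message published on topic `foo`
    # subscribe(reg_addr, 'test_pubsub.0000', 'foo',
    #           lambda topic, msg: print(topic, msg))
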
diff --git a/bin/radical-pilot-component b/bin/radical-pilot-component
index df9e35d91d..a652d2507f 100755
--- a/bin/radical-pilot-component
+++ b/bin/radical-pilot-component
@@ -4,6 +4,7 @@ __copyright__ = "Copyright 2014-2019, http://radical.rutgers.edu"
__license__ = "MIT"
+import os
import sys
import time
@@ -13,12 +14,11 @@ import setproctitle as spt
import radical.utils as ru
import radical.pilot as rp
-dh = ru.DebugHelper()
-
+from radical.pilot.messages import HeartbeatMessage
# ------------------------------------------------------------------------------
#
-def main(cfg):
+def main(sid, reg_addr, uid):
'''
This thin wrapper starts an RCT component. It expects a single argument:
a config to use for the component's configuration. The config must contain:
@@ -32,8 +32,8 @@ def main(cfg):
{
'from' : 'uid',
- 'pub' : 'addr_pub',
- 'sub' : 'addr_sub',
+ 'addr_pub': 'addr_pub',
+ 'addr_sub': 'addr_sub',
'interval': <heartbeat interval>,
'timeout' : <heartbeat timeout>
}
@@ -53,60 +53,71 @@ def main(cfg):
and are interpreted by the component implementation.
'''
- # basic setup: cfg, logger and profiler
- log = ru.Logger(name=cfg.uid, ns='radical.pilot', path=cfg.path)
- prof = ru.Profiler(name=cfg.uid, ns='radical.pilot', path=cfg.path)
+ # basic setup: logger and profiler
+ log = ru.Logger(name=uid, ns='radical.pilot', path=os.getcwd())
+ prof = ru.Profiler(name=uid, ns='radical.pilot', path=os.getcwd())
try:
- prof.prof('comp_start', uid=cfg.uid)
+ prof.prof('comp_start', uid=uid)
prof.disable()
- wrapped_main(cfg, log, prof)
+ wrapped_main(sid, reg_addr, uid, log, prof)
+
finally:
prof.enable()
- prof.prof('comp_stop', uid=cfg.uid)
+ prof.prof('comp_stop', uid=uid)
+
+# ------------------------------------------------------------------------------
+#
+def wrapped_main(sid, reg_addr, uid, log, prof):
-def wrapped_main(cfg, log, prof):
+ spt.setproctitle('rp.%s' % uid)
term = mt.Event()
+ reg = ru.zmq.RegistryClient(url=reg_addr)
+
+ hb_cfg = ru.TypedDict(reg['heartbeat'])
+ c_cfg = ru.TypedDict(reg['components.%s.cfg' % uid])
- spt.setproctitle('rp.%s' % cfg.uid)
+ reg.close()
# start a non-primary session
- session = rp.Session(cfg=cfg, _primary=False)
+ session = rp.Session(uid=sid, cfg=c_cfg,
+ _role=rp.Session._DEFAULT, _reg_addr=reg_addr)
- # create the component and begin to work
- comp = rp.utils.Component.create(cfg, session)
+ # create the instance and begin to work
+ comp = rp.utils.Component.create(c_cfg, session)
comp.start()
- # component runs - send heartbeats so that cmgr knows about it
- hb_pub = ru.zmq.Publisher('heartbeat', cfg.heartbeat.addr_pub) #, log=log)
+ # component runs - send heartbeats so that session knows about it
+ hb_pub = ru.zmq.Publisher('heartbeat', hb_cfg.addr_pub, log=log, prof=prof)
def hb_beat_cb():
- hb_pub.put('heartbeat', msg={'uid': cfg.uid})
+ hb_pub.put('heartbeat', HeartbeatMessage(uid=uid))
def hb_term_cb(hb_uid):
comp.stop()
term.set()
- return None
+ return False
- hb = ru.Heartbeat(uid=cfg.uid,
- timeout=cfg.heartbeat.timeout,
- interval=cfg.heartbeat.interval,
+ hb = ru.Heartbeat(uid=uid,
+ timeout=hb_cfg.timeout,
+ interval=hb_cfg.interval,
beat_cb=hb_beat_cb,
term_cb=hb_term_cb,
log=log)
hb.start()
- # register cmgr heartbeat by beating once
- hb.beat(uid=cfg.cmgr)
+ # always watch out for session heartbeat
+ hb.watch(uid=sid)
- # record cmgr heartbeats
+ # react on session heartbeats
def hb_sub_cb(topic, msg):
- if msg['uid'] == cfg.cmgr:
- hb.beat(uid=cfg.cmgr)
+ hb_msg = HeartbeatMessage(from_dict=msg)
+ if hb_msg.uid == sid:
+ hb.beat(uid=sid)
- ru.zmq.Subscriber('heartbeat', cfg.heartbeat.addr_sub,
+ ru.zmq.Subscriber('heartbeat', hb_cfg.addr_sub,
topic='heartbeat', cb=hb_sub_cb,
log=log, prof=prof)
@@ -119,17 +130,17 @@ def wrapped_main(cfg, log, prof):
#
if __name__ == "__main__":
- if len(sys.argv) != 2:
+ if len(sys.argv) != 4:
sys.stderr.write('error: invalid arguments\n'
- 'usage: %s <cfg_file>\n' % sys.argv[0])
+ 'usage: %s <sid> <reg_addr> <uid>\n' % sys.argv[0])
raise RuntimeError('invalid arguments: %s' % sys.argv)
- fname = sys.argv[1]
- cfg = ru.Config(path=fname)
- path = '%s/%s' % (cfg.path, cfg.uid)
+ sid = sys.argv[1]
+ reg_addr = sys.argv[2]
+ uid = sys.argv[3]
- ru.daemonize(main=main, args=[cfg], stdout='%s.out' % path,
- stderr='%s.err' % path)
+ ru.daemonize(main=main, args=[sid, reg_addr, uid],
+ stdout='%s.out' % uid, stderr='%s.err' % uid)
sys.exit(0)
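
Both daemons follow the same liveness pattern: beat their own uid, watch the
session uid, and shut down when the session goes silent. A condensed sketch of
that pattern under stated assumptions (uids, timings, and `addr_pub` are
hypothetical; the `ru.Heartbeat` and `HeartbeatMessage` signatures are the
ones used above):

    import threading as mt
    import radical.utils as ru
    from radical.pilot.messages import HeartbeatMessage

    term     = mt.Event()
    uid, sid = 'component.0000', 'session.0000'       # hypothetical uids

    def beat_cb():                                    # advertise own liveness
        hb_pub.put('heartbeat', HeartbeatMessage(uid=uid))

    def term_cb(hb_uid):                              # session went silent
        term.set()
        return False

    hb_pub = ru.zmq.Publisher('heartbeat', addr_pub)  # addr_pub from registry
    hb     = ru.Heartbeat(uid=uid, timeout=60.0, interval=10.0,
                          beat_cb=beat_cb, term_cb=term_cb)
    hb.start()
    hb.watch(uid=sid)                                 # expect session beats
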
diff --git a/bin/radical-pilot-proxy-server b/bin/radical-pilot-proxy-server
new file mode 100755
index 0000000000..3f3a7a949a
--- /dev/null
+++ b/bin/radical-pilot-proxy-server
@@ -0,0 +1,39 @@
+#!/usr/bin/env python3
+
+__copyright__ = "Copyright 2013-2022, http://radical.rutgers.edu"
+__license__ = "MIT"
+
+
+import sys
+import time
+
+import radical.utils as ru
+import radical.pilot as rp
+
+
+# ------------------------------------------------------------------------------
+#
+if __name__ == '__main__':
+
+ bridge = rp.Proxy()
+
+ try:
+ sys.stdout.write('uid : %s\n' % bridge.uid)
+ sys.stdout.flush()
+
+ bridge.start()
+
+ sys.stdout.write('addr: %s\n' % bridge.addr)
+ ru.write_json('%s.cfg' % bridge.uid, {'addr': bridge.addr})
+
+ # run forever until process is interrupted or killed
+ while True:
+ time.sleep(1)
+
+ finally:
+ bridge.stop()
+ bridge.wait()
+
+
+# ------------------------------------------------------------------------------
+
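
The proxy server persists its endpoint in `<uid>.cfg`; a client-side
counterpart could recover it like this (the uid `proxy.0000` is made up for
illustration):

    import radical.utils as ru

    cfg  = ru.read_json('proxy.0000.cfg')  # file written by the server above
    addr = cfg['addr']                     # ZMQ endpoint to connect to
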
diff --git a/bin/radical-pilot-prte2prof b/bin/radical-pilot-prte2prof
index f8cbbde3fc..9abec0883c 100755
--- a/bin/radical-pilot-prte2prof
+++ b/bin/radical-pilot-prte2prof
@@ -83,7 +83,7 @@ def handle_line(prof, line, pid, idmap):
# print elems[2:]
# ... DEBUG : prte output: [batch3:80017] [[33357,0],0] [1565343424.463101] ACTIVATE JOB [33357,0] STATE PENDING ALLOCATION AT ../../../../../../../source/prrte-dev/orte/tools/prte/prte.c:497
# ['... DEBUG : prte output:', 'batch3:80017', '33357,0', ',0', '1565343424.463101', 'ACTIVATE JOB', '33357,0', 'STATE PENDING ALLOCATION', '../../../../../../../source/prrte-dev/orte/tools/prte/prte.c:497']
- # _ '2019-08-09 05:37:34,815: agent.0 : MainProcess : DVMWatcher : DEBUG : prte output:',
+ # _ '2019-08-09 05:37:34,815: agent_0 : MainProcess : DVMWatcher : DEBUG : prte output:',
# node 'batch3:80017',
# dvm '33357,0',
# dvmd ',0'
diff --git a/bin/radical-pilot-run-session b/bin/radical-pilot-run-session
index 1e4bf96850..a45b2163b9 100755
--- a/bin/radical-pilot-run-session
+++ b/bin/radical-pilot-run-session
@@ -24,7 +24,7 @@ def run_record(rec):
dburl = s_dict.get('dburl')
rep.info('session dburl: %s' % dburl)
- session = rp.Session(database_url=dburl)
+ session = rp.Session()
rep.ok('session uid : %s' % session.uid)
pmgr = rp.PilotManager(session=session)
diff --git a/bin/radical-pilot-service-signal b/bin/radical-pilot-service-signal
new file mode 100755
index 0000000000..6a200d591f
--- /dev/null
+++ b/bin/radical-pilot-service-signal
@@ -0,0 +1,56 @@
+#!/bin/sh
+
+HELP=$(cat <<EOT
+
+ uid: UID of the service whose startup completed
+
+
+This script is expected to be executed by a service instance which was started
+by the pilot agent. The agent will block any further activity until all started
+services signal their readiness. A service specification may define a timeout
+after which the startup is declared as failed and the agent will abort.
+
+Internally the script will activate the agent's virtualenv and then run a small
+embedded Python script which sends a message to the agent's control channel,
+informing it about the service startup.
+EOT
+)
+
+SCRIPT=$(cat <<EOT
+
+ def __init__(self) -> None:
+ self._ctx = zmq.Context()
+ self._url = None
+ self._thread = None
+ self._term = mt.Event()
+
+
+ # --------------------------------------------------------------------------
+ #
+ @property
+ def url(self):
+ return self._url
+
+
+ # --------------------------------------------------------------------------
+ #
+ def listen(self, url: str = None):
+
+ if not url:
+ url = 'tcp://*:*'
+
+ if self._url:
+ raise RuntimeError('already connected at %s' % self._url)
+
+ self._sock = self._ctx.socket(zmq.SERVER)
+ self._sock.bind(url)
+
+ self._url = self._sock.getsockopt(zmq.LAST_ENDPOINT).decode()
+
+ self._thread = mt.Thread(target=self._work)
+ self._thread.daemon = True
+ self._thread.start()
+
+
+ # --------------------------------------------------------------------------
+ #
+ def _work(self):
+
+ poller = zmq.Poller()
+ poller.register(self._sock, zmq.POLLIN)
+
+ while not self._term.is_set():
+
+ info = poller.poll(timeout=100)
+ if info:
+ # SERVER sockets need the routing_id to address the reply
+ frame = self._sock.recv(copy=False)
+ msg = msgpack.unpackb(frame.bytes)
+ print('< %s' % msg)
+ msg['foo'] = 1
+ self._sock.send(msgpack.packb(msg), routing_id=frame.routing_id)
+ print('> %s' % msg)
+
+
+# ------------------------------------------------------------------------------
+#
+class Client(object):
+
+ # --------------------------------------------------------------------------
+ #
+ def __init__(self) -> None:
+
+ self._ctx = zmq.Context()
+ self._url = None
+
+
+ # --------------------------------------------------------------------------
+ #
+ @property
+ def url(self):
+ return self._url
+
+
+ # --------------------------------------------------------------------------
+ #
+ def connect(self, url: str = None):
+
+ if self._url:
+ raise RuntimeError('already connected at %s' % self._url)
+
+ self._sock = self._ctx.socket(zmq.CLIENT)
+ self._sock.connect(url)
+
+ self._url = self._sock.getsockopt(zmq.LAST_ENDPOINT).decode()
+
+
+ # --------------------------------------------------------------------------
+ #
+ def work(self):
+
+ for i in range(3):
+
+ msg = {'cnt': i}
+ self._sock.send(msgpack.packb(msg))
+ print('> %s' % msg)
+
+ rep = msgpack.unpackb(self._sock.recv())
+ print('< %s' % rep)
+
+
+# ------------------------------------------------------------------------------
+#
+if __name__ == '__main__':
+
+ server = Server()
+ server.listen()
+
+ client = Client()
+ client.connect(server.url)
+ client.work()
+
+
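
The Server/Client pair above frames every payload with msgpack; a standalone
round-trip check of that framing (no sockets involved):

    import msgpack

    msg = {'cnt': 0}
    buf = msgpack.packb(msg)            # dict -> bytes, as sent on the wire
    assert msgpack.unpackb(buf) == msg  # bytes -> dict, as received
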
diff --git a/docs/source/README.md b/docs/source/README.md
index 5da0633889..8dc2a59ff7 100644
--- a/docs/source/README.md
+++ b/docs/source/README.md
@@ -32,7 +32,6 @@ RP's documentation uses [Sphinx](https://www.sphinx-doc.org/en/master/index.html
```shell
cd docs
- export RADICAL_PILOT_DBURL=
sphinx-build source _build -b html
```
@@ -55,4 +54,3 @@ RP's documentation uses [Sphinx](https://www.sphinx-doc.org/en/master/index.html
- Branch name
- Requirements File: relative path to requirements file
- Documentation Type: Select `Sphinx Html`
-- Environment Variable: `RADICAL_PILOT_DBURL` set as private as it contains auth tokens.
diff --git a/docs/source/envs.rst b/docs/source/envs.rst
index c992ca9f9b..cb062e0460 100644
--- a/docs/source/envs.rst
+++ b/docs/source/envs.rst
@@ -24,9 +24,6 @@ End user
* - .. envvar:: RADICAL_BASE
- Root directory where to save temporary state files
- `$HOME/.radical/`
- * - .. envvar:: RADICAL_PILOT_DBURL
- - MongoDB URI string. Mandatory for RP to work
- - {NOT_SET}
* - .. envvar:: RADICAL_UTILS_NTPHOST
- NTP host used for profile syncing
- `0.pool.ntp.org`
diff --git a/docs/source/getting_started.ipynb b/docs/source/getting_started.ipynb
index fab57f1880..b14f80d8ef 100644
--- a/docs/source/getting_started.ipynb
+++ b/docs/source/getting_started.ipynb
@@ -27,7 +27,7 @@
"\n",
"\n",
" \n",
- "__Note:__ Please see [using virtual environments](envs.rst) with RP for more options and detailed information. That will be especially useful when executing RP on supported high performance computing [(HPC) platforms](supported.rst).\n",
+ "__Note:__ Please see [using virtual environments](envs.rst) with RP for more options and detailed information. That will be especially useful when executing RP on [supported high performance computing (HPC) platforms](supported.rst).\n",
"\n",
"
\n",
"\n",
@@ -131,46 +131,6 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "## MongoDB\n",
- "\n",
- "\n",
- " \n",
- "__Warning:__ RP 1.40 will **not** require a MongoDB server.\n",
- "\n",
- "
\n",
- "\n",
- "RP <1.40 requires a MongoDB server to execute. Without one, RP will throw an error and exit. You have multiple options, depending on where you are executing RP and for what application.\n",
- "\n",
- "### Executing RADICAL-Pilot on your local GNU/Linux workstation\n",
- "\n",
- "* [Install MongoDB](https://www.mongodb.com/docs/manual/administration/install-on-linux/) locally.\n",
- "* Use the MongoDB default configuration.\n",
- "\n",
- "### Executing RADICAL-Pilot on a supported HPC platform\n",
- "\n",
- "[Contact](https://github.com/radical-cybertools/radical.pilot/issues) the RADICAL development team, and we will provide you a viable solution.\n",
- "\n",
- "### Configuring RADICAL-Pilot to use a MongoDB server\n",
- "\n",
- "Export the following shell variable in the shell from which you will execute your RP application:\n",
- "\n",
- "```shell\n",
- "export RADICAL_PILOT_DBURL='mongodb://login:password@address:port/db_name'\n",
- "```\n",
- "\n",
- "Where:\n",
- "\n",
- "* `login`: needed only when using a supported HPC platform.\n",
- "* `address`: will be 127.0.0.1 when using RP locally.\n",
- "* `port`: will be 27017 when using RP locally; possibly different when using a supported HPC platform.\n",
- "* `db_name`: needed only when using a supported HPC platform.\n",
- "\n",
- "\n",
- " \n",
- "__Note:__ When executing a MongoDB locally with a default configuration, you will have to use: `export RADICAL_PILOT_DBURL='mongodb://127.0.0.1:27017`. No `login`/`password` or `db_name` needed.\n",
- "\n",
- "
\n",
- "\n",
"## Write your first application\n",
"\n",
"RP executes in batch mode:\n",
@@ -199,7 +159,7 @@
"1. Submit tasks for execution\n",
"1. Wait for tasks to complete execution\n",
"\n",
- "As we have already seen with `RADICAL_PILOT_DBURL`, some of RP behavior can be configured via environment variables. RP's progression bar does not work properly with Jupyter notebooks. Thus, we set it to FALSE."
+ "Some of RP behavior can be configured via environment variables. RP's progression bar does not work properly with Jupyter notebooks. Thus, you may want to set it to FALSE."
]
},
{
@@ -231,7 +191,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "As with every Python application, first import all the required modules."
+ "As with every Python application, first you import all the required modules."
]
},
{
@@ -257,7 +217,7 @@
"source": [
"### Enable user feedback\n",
"\n",
- "As RP implements a batch programming model, by default, it returns a minimal amount of information. After submitting the tasks for execution, RP will remain silent until all the tasks have completed. In practice, when developing and debugging your application, you will want more feedback. We wrote a reporter module that you can use with RP and all the other RADICAL Cybertools.\n",
+ "As RP implements a batch programming model, by default, it returns a minimal amount of information. After submitting the tasks for execution, RP will remain silent until all the tasks have completed. In practice, when developing and debugging your application, you will want more feedback. We wrote a reporter module that you can use with RP and all the other RADICAL-Cybertools.\n",
"\n",
"To use the reporter:\n",
"\n",
diff --git a/docs/source/images/architecture.png b/docs/source/images/architecture.png
old mode 100755
new mode 100644
index a550067179..30ebe0446a
Binary files a/docs/source/images/architecture.png and b/docs/source/images/architecture.png differ
diff --git a/docs/source/supported/amarel.rst b/docs/source/supported/amarel.rst
index 83127b5f6c..440354953f 100644
--- a/docs/source/supported/amarel.rst
+++ b/docs/source/supported/amarel.rst
@@ -51,7 +51,7 @@ General description
.. note::
In order to be able to access Amarel cluster, you must be connected to
Rutgers Virtual Private Network (VPN) with a valid Rutgers ``netid``.
-
+
.. note::
@@ -97,18 +97,6 @@ Install RADICAL-Pilot after activating a corresponding virtual environment:
pip install radical.pilot
-MongoDB
--------
-
-MongoDB service is **not** provided by Amarel cluster, thus, you have to use
-either your running instance of MongoDB service or contact the RADICAL team by
-opening a `ticket `_.
-
-RADICAL-Pilot will connect to the MongoDB instance using a corresponding URI:
-
-.. code-block:: bash
-
- export RADICAL_PILOT_DBURL=""
Launching script example
========================
@@ -125,7 +113,6 @@ launching command for the application itself.
module load python
source ve.rp/bin/activate
- export RADICAL_PILOT_DBURL="mongodb://localhost:27017/"
export RADICAL_PROFILE=TRUE
# for debugging purposes
export RADICAL_LOG_LVL=DEBUG
diff --git a/docs/source/supported/bridges2.rst b/docs/source/supported/bridges2.rst
index 6a617a5e68..70c5b66d3f 100644
--- a/docs/source/supported/bridges2.rst
+++ b/docs/source/supported/bridges2.rst
@@ -82,18 +82,6 @@ Install RADICAL-Pilot after activating a corresponding virtual environment:
# OR in case of conda environment
conda install -c conda-forge radical.pilot
-MongoDB
--------
-
-MongoDB service is **not** provided by Bridges2, thus, you have to use either
-your running instance of MongoDB service or contact the RADICAL team by opening
-a `ticket `_.
-
-RADICAL-Pilot will connect to the MongoDB instance using a corresponding URI:
-
-.. code-block:: bash
-
- export RADICAL_PILOT_DBURL=""
Launching script example
========================
@@ -110,7 +98,6 @@ launching command for the application itself.
module load python
source ve.rp/bin/activate
- export RADICAL_PILOT_DBURL="mongodb://localhost:27017/"
export RADICAL_PROFILE=TRUE
# for debugging purposes
export RADICAL_LOG_LVL=DEBUG
diff --git a/docs/source/supported/delta.rst b/docs/source/supported/delta.rst
index b4da0f0168..8f97277e32 100644
--- a/docs/source/supported/delta.rst
+++ b/docs/source/supported/delta.rst
@@ -81,18 +81,6 @@ Install RADICAL-Pilot after activating a corresponding virtual environment:
Polaris does not provide virtual environments with ``conda``.
-MongoDB
--------
-
-MongoDB service is **not** provided by NCSA, thus, you have to use either your
-running instance of MongoDB service or contact the RADICAL team by opening a
-`ticket `_.
-
-RADICAL-Pilot will connect to the MongoDB instance using a corresponding URI:
-
-.. code-block:: bash
-
- export RADICAL_PILOT_DBURL=""
Launching script example
========================
@@ -109,7 +97,6 @@ launching command for the application itself.
module load python
source ve.rp/bin/activate
- export RADICAL_PILOT_DBURL="mongodb://localhost:27017/"
export RADICAL_PROFILE=TRUE
# for debugging purposes
export RADICAL_LOG_LVL=DEBUG
diff --git a/docs/source/supported/frontera.rst b/docs/source/supported/frontera.rst
index a48c3011c7..49f7d2a356 100644
--- a/docs/source/supported/frontera.rst
+++ b/docs/source/supported/frontera.rst
@@ -58,18 +58,6 @@ Install RADICAL-Pilot after activating a corresponding virtual environment:
pip install radical.pilot
-MongoDB
--------
-
-MongoDB service is **not** provided by TACC, thus, you have to use either your
-running instance of MongoDB service or contact the RADICAL team by opening a
-`ticket `_.
-
-RADICAL-Pilot will connect to the MongoDB instance using a corresponding URI:
-
-.. code-block:: bash
-
- export RADICAL_PILOT_DBURL=""
Launching script example
========================
@@ -86,7 +74,6 @@ launching command for the application itself.
module load python3
source ve.rp/bin/activate
- export RADICAL_PILOT_DBURL="mongodb://localhost:27017/"
export RADICAL_PROFILE=TRUE
# for debugging purposes
export RADICAL_LOG_LVL=DEBUG
diff --git a/docs/source/supported/frontier.rst b/docs/source/supported/frontier.rst
index a4c2dd1c80..1da00ce6dc 100644
--- a/docs/source/supported/frontier.rst
+++ b/docs/source/supported/frontier.rst
@@ -100,26 +100,11 @@ Install RADICAL-Pilot after activating a corresponding virtual environment:
.. code-block:: bash
pip install radical.pilot
-
+
.. note::
Frontier does not provide virtual environments with ``conda``.
-MongoDB
--------
-
-OLCF provides a MongoDB service via
-`Slate `_,
-an infrastructure built on Kubernetes and OpenShift. Please ask the RADICAL team for a
-corresponding MongoDB URI by opening a
-`ticket `_.
-
-RADICAL-Pilot will connect to the MongoDB instance using the provided URI.
-
-.. code-block:: bash
-
- export RADICAL_PILOT_DBURL=""
-
Launching script example
========================
@@ -135,7 +120,6 @@ launching command for the application itself.
module load cray-python
source ve.rp/bin/activate
- export RADICAL_PILOT_DBURL="mongodb://localhost:27017/"
export RADICAL_PROFILE=TRUE
# for debugging purposes
export RADICAL_LOG_LVL=DEBUG
diff --git a/docs/source/supported/perlmutter.rst b/docs/source/supported/perlmutter.rst
index 0a1e79d529..1a2290ba12 100644
--- a/docs/source/supported/perlmutter.rst
+++ b/docs/source/supported/perlmutter.rst
@@ -86,19 +86,6 @@ Install RADICAL-Pilot after activating a corresponding virtual environment:
# OR in case of conda environment
conda install -c conda-forge radical.pilot
-MongoDB
--------
-
-NERSC provides `database services `_,
-including MongoDB. You need to fill out a form to request a database instance -
-https://docs.nersc.gov/services/databases/#requesting-a-database.
-
-RADICAL-Pilot will connect to the MongoDB instance using a corresponding URI:
-
-.. code-block:: bash
-
- export RADICAL_PILOT_DBURL=""
-
Launching script example
========================
@@ -114,7 +101,6 @@ launching command for the application itself.
module load python
source ve.rp/bin/activate
- export RADICAL_PILOT_DBURL="mongodb://localhost:27017/"
export RADICAL_PROFILE=TRUE
# for debugging purposes
export RADICAL_LOG_LVL=DEBUG
diff --git a/docs/source/supported/polaris.rst b/docs/source/supported/polaris.rst
index 78bfd5aba4..b72be3da51 100644
--- a/docs/source/supported/polaris.rst
+++ b/docs/source/supported/polaris.rst
@@ -73,91 +73,6 @@ Install RADICAL-Pilot after activating a corresponding virtual environment:
# OR in case of conda environment
conda install -c conda-forge radical.pilot
-MongoDB
--------
-
-Local installation
-^^^^^^^^^^^^^^^^^^
-
-If MongoDB was already setup and initialized then just run its instance
-(see `Run MongoDB instance <#run-mongodb-instance>`_ subsection).
-
-.. code-block:: bash
-
- cd $HOME
- wget https://downloads.mongodb.com/linux/mongodb-linux-x86_64-enterprise-suse15-4.4.0.tgz
- tar -zxf mongodb-linux-x86_64-enterprise-suse15-4.4.0.tgz
- mv mongodb-linux-x86_64-enterprise-suse15-4.4.0 mongo
- mkdir -p mongo/data mongo/etc mongo/var/log mongo/var/run
- touch mongo/var/log/mongodb.log
-
-Config setup
-^^^^^^^^^^^^
-
-Description of the MongoDB setup is provided in this
-`user guide `_,
-which is the same for all ALCF platforms.
-
-.. code-block:: bash
-
- cat > mongo/etc/mongodb.polaris.conf <<EOF
-
- > use rct_db
- > db.createUser({user: "rct", pwd: "jdWeRT634k", roles: ["readWrite"]})
- > exit
-
-RADICAL-Pilot will connect to the MongoDB instance using the following URI.
-
-.. code-block:: bash
-
- export RADICAL_PILOT_DBURL="mongodb://rct:jdWeRT634k@`hostname -f`:54937/rct_db"
-
Launching script example
========================
@@ -175,9 +90,6 @@ environment with ``conda``.
eval "$(conda shell.posix hook)"
conda activate ve.rp
- $HOME/mongo/bin/mongod -f $HOME/mongo/etc/mongodb.polaris.conf
-
- export RADICAL_PILOT_DBURL="mongodb://rct:jdWeRT634k@`hostname -f`:54937/rct_db"
export RADICAL_PROFILE=TRUE
# for debugging purposes
export RADICAL_LOG_LVL=DEBUG
@@ -185,9 +97,6 @@ environment with ``conda``.
# - run -
python
- # - post run -
- $HOME/mongo/bin/mongod -f $HOME/mongo/etc/mongodb.polaris.conf --shutdown
-
Execute launching script as ``./rp_launcher.sh`` or run it in the background:
.. code-block:: bash
diff --git a/docs/source/supported/rivanna.rst b/docs/source/supported/rivanna.rst
index ba394856d4..12305e2433 100644
--- a/docs/source/supported/rivanna.rst
+++ b/docs/source/supported/rivanna.rst
@@ -74,19 +74,6 @@ Install RADICAL-Pilot after activating a corresponding virtual environment:
Rivanna does not provide virtual environments with ``conda``.
-MongoDB
--------
-
-MongoDB service is **not** provided by UVA, thus, you have to use either your
-running instance of MongoDB service or contact the RADICAL team by opening a
-`ticket `_.
-
-RADICAL-Pilot will connect to the MongoDB instance using a corresponding URL.
-
-.. code-block:: bash
-
- export RADICAL_PILOT_DBURL=""
-
Launching script example
========================
@@ -102,7 +89,6 @@ launching command for the application itself.
module load python
source ve.rp/bin/activate
- export RADICAL_PILOT_DBURL="mongodb://localhost:27017/"
export RADICAL_PROFILE=TRUE
# for debugging purposes
export RADICAL_LOG_LVL=DEBUG
diff --git a/docs/source/supported/summit.rst b/docs/source/supported/summit.rst
index 53dfe452e1..51e540fc47 100644
--- a/docs/source/supported/summit.rst
+++ b/docs/source/supported/summit.rst
@@ -37,7 +37,7 @@ General description
.. note::
Launch method ``MPIRUN`` is able to see only one hardware-thread per core,
- thus make sure that ``SMT`` level is set to ``1`` with a corresponding
+ thus make sure that ``SMT`` level is set to ``1`` with a corresponding
platform ID either with ``export RADICAL_SMT=1`` (before running the
application) or follow the steps below:
@@ -120,21 +120,6 @@ Install RADICAL-Pilot after activating a corresponding virtual environment:
# OR in case of conda environment
conda install -c conda-forge radical.pilot
-MongoDB
--------
-
-OLCF provides a MongoDB service via
-`Slate `_,
-an infrastructure built on Kubernetes and OpenShift. Please ask the RADICAL team for a
-corresponding MongoDB URI by opening a
-`ticket `_.
-
-RADICAL-Pilot will connect to the MongoDB instance using the provided URI.
-
-.. code-block:: bash
-
- export RADICAL_PILOT_DBURL=""
-
Launching script example
========================
@@ -151,7 +136,6 @@ launching command for the application itself.
eval "$(conda shell.posix hook)"
conda activate ve.rp
- export RADICAL_PILOT_DBURL="mongodb://localhost:27017/"
export RADICAL_PROFILE=TRUE
# for debugging purposes
export RADICAL_LOG_LVL=DEBUG
diff --git a/docs/source/testing.rst b/docs/source/testing.rst
new file mode 100644
index 0000000000..531284a978
--- /dev/null
+++ b/docs/source/testing.rst
@@ -0,0 +1,113 @@
+
+.. _chapter_testing:
+
+*******
+Testing
+*******
+
+Introduction
+============
+
+Along with RADICAL-Pilot functionalities, we are developing a growing set of
+unit tests. The source code of the unit tests can be found in
+``src/radical/pilot/tests``. You can run the unit tests via ``pytest``:
+
+.. code-block:: bash
+
+ export RADICAL_PILOT_LOG_LVL=debug
+ pytest tests/
+
+
+Remote Testing
+==============
+
+.. warning::
+
+ Remote Testing is disabled in the current release!
+
+
+By default, the unit tests of RADICAL-Pilot use pilot agents launched on the
+local machine (``localhost``). However, it is possible to run a subset of the
+unit tests (``src/radical/pilot/tests/remote/``) on a remote machine. Remote
+testing can be controlled via a set of environment variables:
+
++-------------------------------------------+-------------------------------------+
+| Environment Variable | What |
++===========================================+=====================================+
+| ``RADICAL_PILOT_TEST_REMOTE_RESOURCE`` | Name (key) of the resource. |
++-------------------------------------------+-------------------------------------+
+| ``RADICAL_PILOT_TEST_REMOTE_SSH_USER_ID`` | User ID on the remote system. |
++-------------------------------------------+-------------------------------------+
+| ``RADICAL_PILOT_TEST_REMOTE_SSH_USER_KEY``| SSH key to use for the connection. |
++-------------------------------------------+-------------------------------------+
+| ``RADICAL_PILOT_TEST_REMOTE_WORKDIR`` | Work directory on the remote system.|
++-------------------------------------------+-------------------------------------+
+| ``RADICAL_PILOT_TEST_REMOTE_CORES`` | Number of cores to allocate. |
++-------------------------------------------+-------------------------------------+
+| ``RADICAL_PILOT_TEST_REMOTE_NUM_CUS`` | Number of Tasks to run. |
++-------------------------------------------+-------------------------------------+
+| ``RADICAL_PILOT_TEST_TIMEOUT`` | Test timeout in minutes. |
++-------------------------------------------+-------------------------------------+
+
+
+For example, if you want to run the unit tests on the XSEDE/ACCESS *Bridges*
+cluster (https://portal.xsede.org/psc-bridges), run:
+
+.. code-block:: bash
+
+ # RADICAL_PILOT_TEST_REMOTE_SSH_USER_ID is optional
+ RADICAL_PILOT_LOG_LVL=DEBUG \
+ RADICAL_PILOT_TEST_REMOTE_SSH_USER_ID=<user_id> \
+ RADICAL_PILOT_TEST_REMOTE_RESOURCE=access.bridges \
+ RADICAL_PILOT_TEST_REMOTE_WORKDIR=<workdir> \
+ RADICAL_PILOT_TEST_REMOTE_CORES=16 \
+ RADICAL_PILOT_TEST_REMOTE_NUM_CUS=64 \
+ python setup.py test
+
+.. note::
+
+ Be aware that it can take quite some time for pilots to get scheduled on
+ the remote system. You can set ``RADICAL_PILOT_TEST_TIMEOUT`` to force the tests
+ to abort after a given number of minutes.
+
+
+Adding New Tests
+================
+
+If you want to add a new test, for example to reproduce an error that you have
+encountered, please follow this procedure:
+
+In the ``tests/issues/`` directory, create a new file. If applicable, name it
+after the issue number in the RADICAL-Pilot
+`issues tracker `_,
+e.g., ``issue_123.py``.
+
+The content of the file should look like this (make sure to change the class
+name):
+
+.. code-block:: python
+
+ import sys
+ import radical.pilot
+
+ #-----------------------------------------------------------------------------
+ #
+ class TestIssue123(object):
+
+ #-------------------------------------------------------------------------
+ #
+ def test_issue_123_part_1(self):
+ """ https://github.com/radical-cybertools/radical.pilot/issues/123
+ """
+ session = radical.pilot.Session()
+
+ # Your test implementation
+
+ session.close()
+
+Now you can re-install RADICAL-Pilot and run your new test. In the source root,
+run:
+
+.. code-block:: bash
+
+ pip install --upgrade .
+ pytest -v tests/issues/issue_123.py::TestIssue123
diff --git a/docs/source/tutorials/configuration.ipynb b/docs/source/tutorials/configuration.ipynb
index ca7e43ecc7..40e93cd92c 100644
--- a/docs/source/tutorials/configuration.ipynb
+++ b/docs/source/tutorials/configuration.ipynb
@@ -130,22 +130,21 @@
" \"description\" : \"Short description of the resource\",\n",
" \"notes\" : \"Notes about resource usage\",\n",
"\n",
- " \"schemas\" : [\"local\", \"ssh\", \"batch\", \"interactive\"],\n",
- " \"local\" :\n",
- " {\n",
- " \"job_manager_endpoint\" : \"slurm://frontera.tacc.utexas.edu/\",\n",
- " \"filesystem_endpoint\" : \"file://frontera.tacc.utexas.edu/\"\n",
- " },\n",
- " \"ssh\" :\n",
- " {\n",
- " \"job_manager_endpoint\" : \"slurm+ssh://frontera.tacc.utexas.edu/\",\n",
- " \"filesystem_endpoint\" : \"sftp://frontera.tacc.utexas.edu/\"\n",
- " },\n",
- " \"batch\" : \"interactive\",\n",
- " \"interactive\" :\n",
- " {\n",
- " \"job_manager_endpoint\" : \"fork://localhost/\",\n",
- " \"filesystem_endpoint\" : \"file://localhost/\"\n",
+ " \"default_schema\" : \"local\",\n",
+ " \"schemas\" : {\n",
+ " \"local\" : {\n",
+ " \"job_manager_endpoint\": \"slurm://frontera.tacc.utexas.edu/\",\n",
+ " \"filesystem_endpoint\" : \"file://frontera.tacc.utexas.edu/\"\n",
+ " },\n",
+ " \"ssh\" : {\n",
+ " \"job_manager_endpoint\": \"slurm+ssh://frontera.tacc.utexas.edu/\",\n",
+ " \"filesystem_endpoint\" : \"sftp://frontera.tacc.utexas.edu/\"\n",
+ " },\n",
+ " \"batch\" : \"interactive\",\n",
+ " \"interactive\" : {\n",
+ " \"job_manager_endpoint\": \"fork://localhost/\",\n",
+ " \"filesystem_endpoint\" : \"file://localhost/\"\n",
+ " },\n",
" },\n",
"\n",
" \"default_queue\" : \"production\",\n",
@@ -235,12 +234,6 @@
"## Examples\n",
"\n",
"\n",
- " \n",
- "__Note:__ For the initial setup regarding MongoDB see the tutorial [Getting Started](../getting_started.ipynb).\n",
- "\n",
- "
\n",
- "\n",
- "\n",
"\n",
"__Note:__ In our examples, we will not show a progression bar while waiting for some operation to complete, e.g., while waiting for a pilot to stop. That is because the progression bar offered by RP's reporter does not work within a notebook. You could use it when executing an RP application as a standalone Python script.\n",
"\n",
@@ -404,10 +397,10 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "\u001b[94mnew session: \u001b[39m\u001b[0m[rp.session.three.mturilli.019495.0003]\u001b[39m\u001b[0m\u001b[94m \\\n",
- "database : \u001b[39m\u001b[0m[mongodb://rct-tutorial:****@95.217.193.116:27017/rct-tutorial]\u001b[39m\u001b[0m\u001b[92m ok\n",
- "\u001b[39m\u001b[0m\u001b[94mcreate pilot manager\u001b[39m\u001b[0m\u001b[92m ok\n",
- "\u001b[39m\u001b[0m"
+ "\u001B[94mnew session: \u001B[39m\u001B[0m[rp.session.three.mturilli.019495.0003]\u001B[39m\u001B[0m\u001B[94m \\\n",
+ "database : \u001B[39m\u001B[0m[mongodb://rct-tutorial:****@95.217.193.116:27017/rct-tutorial]\u001B[39m\u001B[0m\u001B[92m ok\n",
+ "\u001B[39m\u001B[0m\u001B[94mcreate pilot manager\u001B[39m\u001B[0m\u001B[92m ok\n",
+ "\u001B[39m\u001B[0m"
]
}
],
@@ -469,9 +462,9 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "\u001b[94msubmit 1 pilot(s)\u001b[39m\u001b[0m\n",
- " pilot.0000 tacc.frontera_tutorial 56 cores 0 gpus\u001b[39m\u001b[0m\u001b[92m ok\n",
- "\u001b[39m\u001b[0m"
+ "\u001B[94msubmit 1 pilot(s)\u001B[39m\u001B[0m\n",
+ " pilot.0000 tacc.frontera_tutorial 56 cores 0 gpus\u001B[39m\u001B[0m\u001B[92m ok\n",
+ "\u001B[39m\u001B[0m"
]
}
],
@@ -576,13 +569,13 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "\u001b[94mclosing session rp.session.three.mturilli.019495.0003\u001b[39m\u001b[0m\u001b[94m \\\n",
- "close pilot manager\u001b[39m\u001b[0m\u001b[94m \\\n",
+ "\u001B[94mclosing session rp.session.three.mturilli.019495.0003\u001B[39m\u001B[0m\u001B[94m \\\n",
+ "close pilot manager\u001B[39m\u001B[0m\u001B[94m \\\n",
"wait for 1 pilot(s)\n",
- " \u001b[39m\u001b[0m\u001b[92m ok\n",
- "\u001b[39m\u001b[0m\u001b[92m ok\n",
- "\u001b[39m\u001b[0m\u001b[94msession lifetime: 13.1s\u001b[39m\u001b[0m\u001b[92m ok\n",
- "\u001b[39m\u001b[0m"
+ " \u001B[39m\u001B[0m\u001B[92m ok\n",
+ "\u001B[39m\u001B[0m\u001B[92m ok\n",
+ "\u001B[39m\u001B[0m\u001B[94msession lifetime: 13.1s\u001B[39m\u001B[0m\u001B[92m ok\n",
+ "\u001B[39m\u001B[0m"
]
}
],
diff --git a/docs/source/tutorials/describing_tasks.ipynb b/docs/source/tutorials/describing_tasks.ipynb
index 7532303788..3b1134ddd3 100644
--- a/docs/source/tutorials/describing_tasks.ipynb
+++ b/docs/source/tutorials/describing_tasks.ipynb
@@ -1,7 +1,6 @@
{
"cells": [
{
- "attachments": {},
"cell_type": "markdown",
"id": "67521807",
"metadata": {},
@@ -22,32 +21,35 @@
"\n",
"
\n",
"\n",
+ "Let's have a quick check that we have MPI launch method installed. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "567c0f26-3e35-44d3-a81a-ab89a79a3dcd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import radical.utils as ru\n",
+ "\n",
+ "mpi_lm_exists = bool(ru.which(['mpirun', 'mpiexec']))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ec11a436-b9a8-4706-a126-d5f1ba19bd41",
+ "metadata": {},
+ "source": [
"First, some preparatory work for the tutorial. We import some modules and set some variables. Note that we `import radical.pilot as rp` so to abbreviate future API calls. "
]
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": null,
"id": "c8b8387d",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2023-05-18T01:26:46.316432Z",
- "iopub.status.busy": "2023-05-18T01:26:46.316106Z",
- "iopub.status.idle": "2023-05-18T01:26:46.451071Z",
- "shell.execute_reply": "2023-05-18T01:26:46.450250Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'/home/mturilli/ve-notebooks'"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
+ "metadata": {},
+ "outputs": [],
"source": [
"import os\n",
"import sys\n",
@@ -57,15 +59,13 @@
"os.environ['RADICAL_REPORT_ANIME'] = 'False'\n",
"\n",
"import radical.pilot as rp\n",
- "import radical.utils as ru\n",
"\n",
"# determine the path of the currently active virtualenv to simplify some examples below\n",
"ve_path = os.path.dirname(os.path.dirname(ru.which('python3')))\n",
- "display(ve_path)\n"
+ "display(ve_path)"
]
},
{
- "attachments": {},
"cell_type": "markdown",
"id": "97ab1560",
"metadata": {},
@@ -77,36 +77,10 @@
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": null,
"id": "7e4566d0",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2023-05-18T01:26:46.455734Z",
- "iopub.status.busy": "2023-05-18T01:26:46.455532Z",
- "iopub.status.idle": "2023-05-18T01:27:19.693837Z",
- "shell.execute_reply": "2023-05-18T01:27:19.692492Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\u001b[94m\u001b[1m\n",
- "\u001b[39m\u001b[0m\u001b[94m\u001b[1m================================================================================\n",
- "\u001b[39m\u001b[0m\u001b[94m\u001b[1m Tutorial: Describing Tasks (RP version 1.34.0) \n",
- "\u001b[39m\u001b[0m\u001b[94m\u001b[1m================================================================================\n",
- "\u001b[39m\u001b[0m\u001b[94m\u001b[1m\n",
- "\u001b[39m\u001b[0m\u001b[94mnew session: \u001b[39m\u001b[0m[rp.session.three.mturilli.019495.0002]\u001b[39m\u001b[0m\u001b[94m \\\n",
- "database : \u001b[39m\u001b[0m[mongodb://rct-tutorial:****@95.217.193.116:27017/rct-tutorial]\u001b[39m\u001b[0m\u001b[92m ok\n",
- "\u001b[39m\u001b[0m\u001b[94mcreate pilot manager\u001b[39m\u001b[0m\u001b[92m ok\n",
- "\u001b[39m\u001b[0m\u001b[94mcreate task manager\u001b[39m\u001b[0m\u001b[92m ok\n",
- "\u001b[39m\u001b[0m\u001b[94msubmit 1 pilot(s)\u001b[39m\u001b[0m\n",
- " pilot.0000 local.localhost 32 cores 1 gpus\u001b[39m\u001b[0m\u001b[92m ok\n",
- "\u001b[39m\u001b[0m\u001b[94mpilot state: PMGR_ACTIVE\u001b[39m\u001b[0m"
- ]
- }
- ],
+ "metadata": {},
+ "outputs": [],
"source": [
"# configure reporter output \n",
"report = ru.Reporter(name='radical.pilot')\n",
@@ -131,7 +105,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"id": "1ce411cb",
"metadata": {},
@@ -155,34 +128,17 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": null,
"id": "1ba782cd",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2023-05-18T01:27:19.698223Z",
- "iopub.status.busy": "2023-05-18T01:27:19.697879Z",
- "iopub.status.idle": "2023-05-18T01:27:19.742619Z",
- "shell.execute_reply": "2023-05-18T01:27:19.741824Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "submit: \u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m\n",
- "\u001b[39m\u001b[0m"
- ]
- }
- ],
+ "metadata": {},
+ "outputs": [],
"source": [
"# create a minimal executable task\n",
"td = rp.TaskDescription({'executable': '/bin/date'})\n",
- "task = tmgr.submit_tasks(td)\n"
+ "task = tmgr.submit_tasks(td)"
]
},
{
- "attachments": {},
"cell_type": "markdown",
"id": "03112275",
"metadata": {},
@@ -192,44 +148,15 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": null,
"id": "5f2ea29b",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2023-05-18T01:27:19.745416Z",
- "iopub.status.busy": "2023-05-18T01:27:19.745184Z",
- "iopub.status.idle": "2023-05-18T01:27:25.293949Z",
- "shell.execute_reply": "2023-05-18T01:27:25.293306Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "wait : \u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m\n",
- "\u001b[39m\u001b[0m\u001b[94m\tDONE : 1\n",
- "\u001b[39m\u001b[0m\u001b[92m ok\n",
- "\u001b[39m\u001b[0m"
- ]
- },
- {
- "data": {
- "text/plain": [
- "['DONE']"
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "metadata": {},
+ "outputs": [],
"source": [
"tmgr.wait_tasks()"
]
},
{
- "attachments": {},
"cell_type": "markdown",
"id": "9efbbe7a",
"metadata": {},
@@ -249,44 +176,10 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": null,
"id": "e7a7d0ac",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2023-05-18T01:27:25.296464Z",
- "iopub.status.busy": "2023-05-18T01:27:25.296248Z",
- "iopub.status.idle": "2023-05-18T01:27:25.315995Z",
- "shell.execute_reply": "2023-05-18T01:27:25.315222Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "uid : task.000000\n",
- "\u001b[39m\u001b[0mtmgr : tmgr.0000\n",
- "\u001b[39m\u001b[0mpilot : pilot.0000\n",
- "\u001b[39m\u001b[0mname : \n",
- "\u001b[39m\u001b[0mexecutable : /bin/date\n",
- "\u001b[39m\u001b[0mstate : DONE\n",
- "\u001b[39m\u001b[0mexit_code : 0\n",
- "\u001b[39m\u001b[0mstdout : Thu May 18 03:27:23 AM CEST 2023\n",
- "\u001b[39m\u001b[0mstderr : \n",
- "\u001b[39m\u001b[0mreturn_value : None\n",
- "\u001b[39m\u001b[0mexception : None\n",
- "\u001b[39m\u001b[0m\n",
- "\u001b[39m\u001b[0mendpoint_fs : file://localhost/\n",
- "\u001b[39m\u001b[0mresource_sandbox: file://localhost/home/mturilli/radical.pilot.sandbox\n",
- "\u001b[39m\u001b[0msession_sandbox : file://localhost/home/mturilli/radical.pilot.sandbox/rp.session.three.mturilli.019495.0002\n",
- "\u001b[39m\u001b[0mpilot_sandbox : file://localhost/home/mturilli/radical.pilot.sandbox/rp.session.three.mturilli.019495.0002/pilot.0000/\n",
- "\u001b[39m\u001b[0mtask_sandbox : file://localhost/home/mturilli/radical.pilot.sandbox/rp.session.three.mturilli.019495.0002/pilot.0000/task.000000/\n",
- "\u001b[39m\u001b[0mclient_sandbox : /home/mturilli/github/radical.pilot/docs/source/tutorials\n",
- "\u001b[39m\u001b[0mmetadata : None\n",
- "\u001b[39m\u001b[0m"
- ]
- }
- ],
+ "metadata": {},
+ "outputs": [],
"source": [
"report.plain('uid : %s\\n' % task.uid)\n",
"report.plain('tmgr : %s\\n' % task.tmgr.uid)\n",
@@ -306,11 +199,10 @@
"report.plain('pilot_sandbox : %s\\n' % task.pilot_sandbox)\n",
"report.plain('task_sandbox : %s\\n' % task.task_sandbox)\n",
"report.plain('client_sandbox : %s\\n' % task.client_sandbox)\n",
- "report.plain('metadata : %s\\n' % task.metadata)\n"
+ "report.plain('metadata : %s\\n' % task.metadata)"
]
},
{
- "attachments": {},
"cell_type": "markdown",
"id": "7273c6ea",
"metadata": {},
@@ -328,27 +220,10 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": null,
"id": "32c95d9a",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2023-05-18T01:27:25.319462Z",
- "iopub.status.busy": "2023-05-18T01:27:25.319172Z",
- "iopub.status.idle": "2023-05-18T01:27:25.439540Z",
- "shell.execute_reply": "2023-05-18T01:27:25.438841Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "create: \u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m\n",
- "\u001b[39m\u001b[0msubmit: \u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m\n",
- "\u001b[39m\u001b[0m"
- ]
- }
- ],
+ "metadata": {},
+ "outputs": [],
"source": [
"import string\n",
"letters = string.ascii_lowercase + string.ascii_uppercase\n",
@@ -367,7 +242,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"id": "187dbca6",
"metadata": {},
@@ -377,38 +251,10 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": null,
"id": "fa13837b",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2023-05-18T01:27:25.441944Z",
- "iopub.status.busy": "2023-05-18T01:27:25.441799Z",
- "iopub.status.idle": "2023-05-18T01:27:31.296128Z",
- "shell.execute_reply": "2023-05-18T01:27:31.295235Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "wait : \u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m\n",
- "\u001b[39m\u001b[0m\u001b[94m\tDONE : 3\n",
- "\u001b[39m\u001b[0m\u001b[94m\tFAILED : 49\n",
- "\u001b[39m\u001b[0m\u001b[92m ok\n",
- "\u001b[39m\u001b[0m"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "task.000021: ['-u']: Thu May 18 01:27:29 AM UTC 2023\n",
- "task.000035: ['-I']: 2023-05-18\n",
- "task.000044: ['-R']: Thu, 18 May 2023 03:27:29 +0200\n"
- ]
- }
- ],
+ "metadata": {},
+ "outputs": [],
"source": [
"tmgr.wait_tasks([task.uid for task in tasks])\n",
"\n",
@@ -418,7 +264,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"id": "0cc12709",
"metadata": {},
@@ -428,152 +273,28 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": null,
"id": "a3708cb3",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2023-05-18T01:27:31.299523Z",
- "iopub.status.busy": "2023-05-18T01:27:31.298749Z",
- "iopub.status.idle": "2023-05-18T01:27:31.447222Z",
- "shell.execute_reply": "2023-05-18T01:27:31.446657Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "wait : \u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m\n",
- "\u001b[39m\u001b[0m\u001b[94m\tDONE : 3\n",
- "\u001b[39m\u001b[0m\u001b[94m\tFAILED : 49\n",
- "\u001b[39m\u001b[0m\u001b[92m ok\n",
- "\u001b[39m\u001b[0m"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "task.000001: ['-a']: /bin/date: invalid option -- 'a'\n",
- "Try '/bin/date --help' for more information.\n",
- "task.000002: ['-b']: /bin/date: invalid option -- 'b'\n",
- "Try '/bin/date --help' for more information.\n",
- "task.000003: ['-c']: /bin/date: invalid option -- 'c'\n",
- "Try '/bin/date --help' for more information.\n",
- "task.000004: ['-d']: /bin/date: option requires an argument -- 'd'\n",
- "Try '/bin/date --help' for more information.\n",
- "task.000005: ['-e']: /bin/date: invalid option -- 'e'\n",
- "Try '/bin/date --help' for more information.\n",
- "task.000006: ['-f']: /bin/date: option requires an argument -- 'f'\n",
- "Try '/bin/date --help' for more information.\n",
- "task.000007: ['-g']: /bin/date: invalid option -- 'g'\n",
- "Try '/bin/date --help' for more information.\n",
- "task.000008: ['-h']: /bin/date: invalid option -- 'h'\n",
- "Try '/bin/date --help' for more information.\n",
- "task.000009: ['-i']: /bin/date: invalid option -- 'i'\n",
- "Try '/bin/date --help' for more information.\n",
- "task.000010: ['-j']: /bin/date: invalid option -- 'j'\n",
- "Try '/bin/date --help' for more information.\n",
- "task.000011: ['-k']: /bin/date: invalid option -- 'k'\n",
- "Try '/bin/date --help' for more information.\n",
- "task.000012: ['-l']: /bin/date: invalid option -- 'l'\n",
- "Try '/bin/date --help' for more information.\n",
- "task.000013: ['-m']: /bin/date: invalid option -- 'm'\n",
- "Try '/bin/date --help' for more information.\n",
- "task.000014: ['-n']: /bin/date: invalid option -- 'n'\n",
- "Try '/bin/date --help' for more information.\n",
- "task.000015: ['-o']: /bin/date: invalid option -- 'o'\n",
- "Try '/bin/date --help' for more information.\n",
- "task.000016: ['-p']: /bin/date: invalid option -- 'p'\n",
- "Try '/bin/date --help' for more information.\n",
- "task.000017: ['-q']: /bin/date: invalid option -- 'q'\n",
- "Try '/bin/date --help' for more information.\n",
- "task.000018: ['-r']: /bin/date: option requires an argument -- 'r'\n",
- "Try '/bin/date --help' for more information.\n",
- "task.000019: ['-s']: /bin/date: option requires an argument -- 's'\n",
- "Try '/bin/date --help' for more information.\n",
- "task.000020: ['-t']: /bin/date: invalid option -- 't'\n",
- "Try '/bin/date --help' for more information.\n",
- "task.000022: ['-v']: /bin/date: invalid option -- 'v'\n",
- "Try '/bin/date --help' for more information.\n",
- "task.000023: ['-w']: /bin/date: invalid option -- 'w'\n",
- "Try '/bin/date --help' for more information.\n",
- "task.000024: ['-x']: /bin/date: invalid option -- 'x'\n",
- "Try '/bin/date --help' for more information.\n",
- "task.000025: ['-y']: /bin/date: invalid option -- 'y'\n",
- "Try '/bin/date --help' for more information.\n",
- "task.000026: ['-z']: /bin/date: invalid option -- 'z'\n",
- "Try '/bin/date --help' for more information.\n",
- "task.000027: ['-A']: /bin/date: invalid option -- 'A'\n",
- "Try '/bin/date --help' for more information.\n",
- "task.000028: ['-B']: /bin/date: invalid option -- 'B'\n",
- "Try '/bin/date --help' for more information.\n",
- "task.000029: ['-C']: /bin/date: invalid option -- 'C'\n",
- "Try '/bin/date --help' for more information.\n",
- "task.000030: ['-D']: /bin/date: invalid option -- 'D'\n",
- "Try '/bin/date --help' for more information.\n",
- "task.000031: ['-E']: /bin/date: invalid option -- 'E'\n",
- "Try '/bin/date --help' for more information.\n",
- "task.000032: ['-F']: /bin/date: invalid option -- 'F'\n",
- "Try '/bin/date --help' for more information.\n",
- "task.000033: ['-G']: /bin/date: invalid option -- 'G'\n",
- "Try '/bin/date --help' for more information.\n",
- "task.000034: ['-H']: /bin/date: invalid option -- 'H'\n",
- "Try '/bin/date --help' for more information.\n",
- "task.000036: ['-J']: /bin/date: invalid option -- 'J'\n",
- "Try '/bin/date --help' for more information.\n",
- "task.000037: ['-K']: /bin/date: invalid option -- 'K'\n",
- "Try '/bin/date --help' for more information.\n",
- "task.000038: ['-L']: /bin/date: invalid option -- 'L'\n",
- "Try '/bin/date --help' for more information.\n",
- "task.000039: ['-M']: /bin/date: invalid option -- 'M'\n",
- "Try '/bin/date --help' for more information.\n",
- "task.000040: ['-N']: /bin/date: invalid option -- 'N'\n",
- "Try '/bin/date --help' for more information.\n",
- "task.000041: ['-O']: /bin/date: invalid option -- 'O'\n",
- "Try '/bin/date --help' for more information.\n",
- "task.000042: ['-P']: /bin/date: invalid option -- 'P'\n",
- "Try '/bin/date --help' for more information.\n",
- "task.000043: ['-Q']: /bin/date: invalid option -- 'Q'\n",
- "Try '/bin/date --help' for more information.\n",
- "task.000045: ['-S']: /bin/date: invalid option -- 'S'\n",
- "Try '/bin/date --help' for more information.\n",
- "task.000046: ['-T']: /bin/date: invalid option -- 'T'\n",
- "Try '/bin/date --help' for more information.\n",
- "task.000047: ['-U']: /bin/date: invalid option -- 'U'\n",
- "Try '/bin/date --help' for more information.\n",
- "task.000048: ['-V']: /bin/date: invalid option -- 'V'\n",
- "Try '/bin/date --help' for more information.\n",
- "task.000049: ['-W']: /bin/date: invalid option -- 'W'\n",
- "Try '/bin/date --help' for more information.\n",
- "task.000050: ['-X']: /bin/date: invalid option -- 'X'\n",
- "Try '/bin/date --help' for more information.\n",
- "task.000051: ['-Y']: /bin/date: invalid option -- 'Y'\n",
- "Try '/bin/date --help' for more information.\n",
- "task.000052: ['-Z']: /bin/date: invalid option -- 'Z'\n",
- "Try '/bin/date --help' for more information.\n"
- ]
- }
- ],
+ "metadata": {},
+ "outputs": [],
"source": [
"tmgr.wait_tasks([task.uid for task in tasks])\n",
"\n",
"for task in tasks:\n",
" if task.state == rp.FAILED:\n",
- " print('%s: %s: %s' % (task.uid, task.description['arguments'], task.stderr.strip()))\n"
+ " print('%s: %s: %s' % (task.uid, task.description['arguments'], task.stderr.strip()))"
]
},
{
- "attachments": {},
"cell_type": "markdown",
"id": "82299910",
"metadata": {},
"source": [
"## MPI Tasks and Task Resources\n",
"\n",
- "So far, we run single-core tasks. The most common way for application to utilize multiple cores and nodes on HPC machines is to use MPI as communication layer which coordinates multiple application processes, i.e., MPI ranks. In fact, the notion of `ranks` is central to RP's `TaskDescription` class. All MPI ranks will be near-exact copies of each other: they run in the same work directory and the same `environment`, are defined by the same `executable` and `arguments`, get the same amount of resources allocated, etc. Notable exceptions are:\n",
+ "So far, we run single-core tasks. The most common way for application to utilize multiple cores and nodes on HPC machines is to use MPI as a communication layer, which coordinates multiple application processes, i.e., MPI ranks. In fact, the notion of `ranks` is central to RP's `TaskDescription` class. All MPI ranks will be near-exact copies of each other: they run in the same work directory and the same `environment`, are defined by the same `executable` and `arguments`, get the same amount of resources allocated, etc. Notable exceptions are:\n",
"\n",
- " - Rank processes may run on different nodes;\n",
+ " - rank processes may run on different nodes;\n",
" - rank processes can communicate via MPI;\n",
" - each rank process obtains a unique rank ID.\n",
"\n",
@@ -601,125 +322,28 @@
"\n",
"__Note:__ No core pinning is performed on localhost. Thus, tasks see all CPU cores as available to them. However, the `THREADS` information still reports the correct number of assigned CPU cores.\n",
"\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "__Note:__ If there is no MPI launch method installed, then we will proceed with a single rank.\n",
+ "\n",
"
"
]
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": null,
"id": "9047b209",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2023-05-18T01:27:31.450159Z",
- "iopub.status.busy": "2023-05-18T01:27:31.449859Z",
- "iopub.status.idle": "2023-05-18T01:27:39.266337Z",
- "shell.execute_reply": "2023-05-18T01:27:39.265474Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- ".\u001b[39m\u001b[0m.\u001b[39m\u001b[0m.\u001b[39m\u001b[0m.\u001b[39m\u001b[0m\n",
- "\u001b[39m\u001b[0msubmit: \u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m\n",
- "\u001b[39m\u001b[0mwait : \u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m\n",
- "\u001b[39m\u001b[0m\u001b[94m\tDONE : 4\n",
- "\u001b[39m\u001b[0m\u001b[92m ok\n",
- "\u001b[39m\u001b[0m"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "--- task.000053:\n",
- "0 : PID : 1284029\n",
- "0 : NODE : three\n",
- "0 : CPUS : 1111111111111111111111111111111111111111111111111111111111111111\n",
- "0 : GPUS : 0\n",
- "0 : RANK : 0\n",
- "0 : THREADS : 1\n",
- "0 : SLEEP : 1\n",
- "\n",
- "--- task.000054:\n",
- "0 : PID : 1284080\n",
- "0 : NODE : three\n",
- "0 : CPUS : 0000000000000000000000000000000100000000000000000000000000000001\n",
- "0 : GPUS : 0\n",
- "0 : RANK : 0\n",
- "0 : THREADS : 2\n",
- "0 : SLEEP : 2\n",
- "1 : PID : 1284086\n",
- "1 : NODE : three\n",
- "1 : CPUS : 0000000000000000000000000000001000000000000000000000000000000010\n",
- "1 : GPUS : 0\n",
- "1 : RANK : 1\n",
- "1 : THREADS : 2\n",
- "1 : SLEEP : 2\n",
- "\n",
- "--- task.000055:\n",
- "1 : PID : 1284190\n",
- "1 : NODE : three\n",
- "1 : CPUS : 1111111111111111111111111111111111111111111111111111111111111111\n",
- "1 : GPUS : 0\n",
- "1 : RANK : 1\n",
- "1 : THREADS : 3\n",
- "1 : SLEEP : 3\n",
- "2 : PID : 1284205\n",
- "2 : NODE : three\n",
- "2 : CPUS : 1111111111111111111111111111111111111111111111111111111111111111\n",
- "2 : GPUS : 0\n",
- "2 : RANK : 2\n",
- "2 : THREADS : 3\n",
- "2 : SLEEP : 3\n",
- "0 : PID : 1284167\n",
- "0 : NODE : three\n",
- "0 : CPUS : 1111111111111111111111111111111111111111111111111111111111111111\n",
- "0 : GPUS : 0\n",
- "0 : RANK : 0\n",
- "0 : THREADS : 3\n",
- "0 : SLEEP : 3\n",
- "\n",
- "--- task.000056:\n",
- "3 : PID : 1284214\n",
- "3 : NODE : three\n",
- "3 : CPUS : 1111111111111111111111111111111111111111111111111111111111111111\n",
- "3 : GPUS : 0\n",
- "3 : RANK : 3\n",
- "3 : THREADS : 4\n",
- "3 : SLEEP : 4\n",
- "0 : PID : 1284157\n",
- "0 : NODE : three\n",
- "0 : CPUS : 1111111111111111111111111111111111111111111111111111111111111111\n",
- "0 : GPUS : 0\n",
- "0 : RANK : 0\n",
- "0 : THREADS : 4\n",
- "0 : SLEEP : 4\n",
- "1 : PID : 1284180\n",
- "1 : NODE : three\n",
- "1 : CPUS : 1111111111111111111111111111111111111111111111111111111111111111\n",
- "1 : GPUS : 0\n",
- "1 : RANK : 1\n",
- "1 : THREADS : 4\n",
- "1 : SLEEP : 4\n",
- "2 : PID : 1284192\n",
- "2 : NODE : three\n",
- "2 : CPUS : 1111111111111111111111111111111111111111111111111111111111111111\n",
- "2 : GPUS : 0\n",
- "2 : RANK : 2\n",
- "2 : THREADS : 4\n",
- "2 : SLEEP : 4\n",
- "\n"
- ]
- }
- ],
+ "metadata": {},
+ "outputs": [],
"source": [
"tds = list()\n",
"for n in range(4):\n",
+ " ranks = (n + 1) if mpi_lm_exists else 1\n",
" tds.append(rp.TaskDescription({'executable' : ve_path + '/bin/radical-pilot-hello.sh',\n",
" 'arguments' : [n + 1], \n",
- " 'ranks' : (n + 1), \n",
+ " 'ranks' : ranks, \n",
" 'cores_per_rank': (n + 1),\n",
" 'threading_type': rp.OpenMP}))\n",
" report.progress()\n",
@@ -730,11 +354,10 @@
"tmgr.wait_tasks([task.uid for task in tasks])\n",
"\n",
"for task in tasks:\n",
- " print('--- %s:\\n%s\\n' % (task.uid, task.stdout.strip()))\n"
+ " print('--- %s:\\n%s\\n' % (task.uid, task.stdout.strip()))"
]
},
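The cell above gates each task's rank count on `mpi_lm_exists`, a flag defined earlier in the notebook. As a rough sketch of how such a flag might be derived (an assumption, not the notebook's actual definition):

```python
# Hypothetical check: treat an MPI launch method as available if a common
# launcher is on $PATH -- the notebook may define mpi_lm_exists differently.
import shutil

mpi_lm_exists = any(shutil.which(cmd) for cmd in ('mpirun', 'mpiexec', 'srun'))
```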
{
- "attachments": {},
"cell_type": "markdown",
"id": "420ed233",
"metadata": {},
@@ -753,41 +376,10 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": null,
"id": "0fd464ed",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2023-05-18T01:27:39.271133Z",
- "iopub.status.busy": "2023-05-18T01:27:39.270540Z",
- "iopub.status.idle": "2023-05-18T01:27:45.174660Z",
- "shell.execute_reply": "2023-05-18T01:27:45.173670Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "submit: \u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m\n",
- "\u001b[39m\u001b[0mwait : \u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m\n",
- "\u001b[39m\u001b[0m\u001b[94m\tDONE : 1\n",
- "\u001b[39m\u001b[0m\u001b[92m ok\n",
- "\u001b[39m\u001b[0m"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "-rw-rw---- 1 mturilli mturilli 24 May 18 03:27 /tmp/output.test.dat\n",
- "-rw-rw---- 1 mturilli mturilli 0 May 18 03:27 /tmp/output.test.err\n",
- "-rw-rw---- 1 mturilli mturilli 0 May 18 03:27 /tmp/output.test.out\n",
- "\n",
- " 61 104 3465\n",
- "\n"
- ]
- }
- ],
+ "metadata": {},
+ "outputs": [],
"source": [
"\n",
"td = rp.TaskDescription({'executable' : '/bin/sh',\n",
@@ -808,7 +400,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"id": "a4bb97c2",
"metadata": {},
@@ -823,7 +414,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"id": "200d8813",
"metadata": {},
@@ -851,55 +441,10 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": null,
"id": "059fa07e",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2023-05-18T01:27:45.178679Z",
- "iopub.status.busy": "2023-05-18T01:27:45.177969Z",
- "iopub.status.idle": "2023-05-18T01:27:49.365187Z",
- "shell.execute_reply": "2023-05-18T01:27:49.364347Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "submit: \u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m\n",
- "\u001b[39m\u001b[0mwait : \u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m\n",
- "\u001b[39m\u001b[0m\u001b[94m\tDONE : 1\n",
- "\u001b[39m\u001b[0m\u001b[92m ok\n",
- "\u001b[39m\u001b[0m"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[... CONTENT SHORTENED ...]\n",
- "EL_ADDR=144.76.72.175:27017\n",
- "RP_BOOTSTRAP_0_REDIR=True\n",
- "RP_GTOD=/home/mturilli/radical.pilot.sandbox/rp.session.three.mturilli.019495.0002//pilot.0000//gtod\n",
- "RP_PILOT_ID=pilot.0000\n",
- "RP_PILOT_SANDBOX=/home/mturilli/radical.pilot.sandbox/rp.session.three.mturilli.019495.0002//pilot.0000/\n",
- "RP_PROF=/home/mturilli/radical.pilot.sandbox/rp.session.three.mturilli.019495.0002//pilot.0000//prof\n",
- "RP_PROF_TGT=/home/mturilli/radical.pilot.sandbox/rp.session.three.mturilli.019495.0002//pilot.0000//task.000058/task.000058.prof\n",
- "RP_RANK=0\n",
- "RP_RANKS=1\n",
- "RP_RESOURCE=local.localhost\n",
- "RP_RESOURCE_SANDBOX=/home/mturilli/radical.pilot.sandbox\n",
- "RP_SESSION_ID=rp.session.three.mturilli.019495.0002\n",
- "RP_SESSION_SANDBOX=/home/mturilli/radical.pilot.sandbox/rp.session.three.mturilli.019495.0002/\n",
- "RP_TASK_ID=task.000058\n",
- "RP_TASK_NAME=task.000058\n",
- "RP_TASK_SANDBOX=/home/mturilli/radical.pilot.sandbox/rp.session.three.mturilli.019495.0002//pilot.0000//task.000058\n",
- "RP_VENV_PATH=/home/mturilli/radical.pilot.sandbox/ve.local.localhost.1.34.0\n",
- "RP_VENV_TYPE=venv\n",
- "\n"
- ]
- }
- ],
+ "metadata": {},
+ "outputs": [],
"source": [
"td = rp.TaskDescription({'executable' : '/bin/sh',\n",
" 'arguments' : ['-c', 'printf \"FOO=$FOO\\nBAR=$BAR\\nSHELL=$SHELL\\n\"; env | grep RP_ | sort'],\n",
@@ -911,7 +456,6 @@
]
},
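The task above prints `FOO`, `BAR`, `SHELL` and the `RP_*` variables that RP injects into each task's environment. A minimal sketch of passing custom variables through the `environment` attribute of `rp.TaskDescription`, matching the `printf` in the cell above:

```python
# Sketch: user-defined variables are merged into the task's environment and
# are visible to the executable alongside the RP_* variables set by RP.
td = rp.TaskDescription({'executable' : '/bin/sh',
                         'arguments'  : ['-c', 'echo "FOO=$FOO BAR=$BAR"'],
                         'environment': {'FOO': 'foo', 'BAR': 'bar'}})
```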
{
- "attachments": {},
"cell_type": "markdown",
"id": "70d849d8",
"metadata": {},
@@ -936,48 +480,10 @@
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": null,
"id": "15728941",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2023-05-18T01:27:49.368303Z",
- "iopub.status.busy": "2023-05-18T01:27:49.367565Z",
- "iopub.status.idle": "2023-05-18T01:27:54.344539Z",
- "shell.execute_reply": "2023-05-18T01:27:54.343996Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "submit: \u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m\n",
- "\u001b[39m\u001b[0mwait : \u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m\n",
- "\u001b[39m\u001b[0m\u001b[94m\tDONE : 1\n",
- "\u001b[39m\u001b[0m\u001b[92m ok\n",
- "\u001b[39m\u001b[0m"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Requirement already satisfied: pyyaml in /home/mturilli/ve-notebooks/lib/python3.10/site-packages (6.0)\n",
- "/home/mturilli/ve-notebooks/bin/python3\n",
- "Name: PyYAML\n",
- "Version: 6.0\n",
- "Summary: YAML parser and emitter for Python\n",
- "Home-page: https://pyyaml.org/\n",
- "Author: Kirill Simonov\n",
- "Author-email: xi@resolvent.net\n",
- "License: MIT\n",
- "Location: /home/mturilli/ve-notebooks/lib/python3.10/site-packages\n",
- "Requires: \n",
- "Required-by: jupyter-events, jupyter-nbextensions-configurator, myst-parser\n",
- "\n"
- ]
- }
- ],
+ "metadata": {},
+ "outputs": [],
"source": [
"td = rp.TaskDescription({'pre_exec' : ['. %s/bin/activate' % ve_path, \n",
" 'pip install pyyaml'],\n",
@@ -986,11 +492,10 @@
" })\n",
"task = tmgr.submit_tasks(td)\n",
"tmgr.wait_tasks([task.uid])\n",
- "print(task.stdout)\n"
+ "print(task.stdout)"
]
},
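The `pre_exec` commands above run in the task's shell, inside the task sandbox, before the executable starts. A minimal sketch of the same pattern without the `pip install` step, reusing the notebook's `ve_path`:

```python
# Sketch: pre_exec commands run before the executable; a failing pre_exec
# command fails the task.
td = rp.TaskDescription({'pre_exec'  : ['. %s/bin/activate' % ve_path],
                         'executable': 'python3',
                         'arguments' : ['-c', 'import sys; print(sys.executable)']})
```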
{
- "attachments": {},
"cell_type": "markdown",
"id": "78f3f8a7",
"metadata": {},
@@ -1002,38 +507,10 @@
},
{
"cell_type": "code",
- "execution_count": 13,
+ "execution_count": null,
"id": "41467fc2",
- "metadata": {
- "execution": {
- "iopub.execute_input": "2023-05-18T01:27:54.346733Z",
- "iopub.status.busy": "2023-05-18T01:27:54.346503Z",
- "iopub.status.idle": "2023-05-18T01:28:25.421677Z",
- "shell.execute_reply": "2023-05-18T01:28:25.420783Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "submit: \u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m\n",
- "\u001b[39m\u001b[0mwait : \u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m\n",
- "\u001b[39m\u001b[0m\u001b[94m\tDONE : 1\n",
- "\u001b[39m\u001b[0m\u001b[92m ok\n",
- "\u001b[39m\u001b[0m"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "/home/mturilli/radical.pilot.sandbox/rp.session.three.mturilli.019495.0002/pilot.0000/env/rp_named_env.test_env/bin/python3\n",
- "psutil 5.9.5\n",
- "\n"
- ]
- }
- ],
+ "metadata": {},
+ "outputs": [],
"source": [
"\n",
"pilot.prepare_env(env_name='test_env', \n",
@@ -1051,41 +528,23 @@
},
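The `prepare_env` call above builds a named environment on the target resource; tasks opt into it via the `named_env` attribute. A minimal sketch of that pairing, using the `psutil` setup from this section:

```python
# Sketch: prepare a virtualenv once per pilot, then reference it by name
# from any number of task descriptions.
pilot.prepare_env(env_name='test_env',
                  env_spec={'type' : 'virtualenv',
                            'setup': ['psutil']})

td = rp.TaskDescription({'executable': 'python3',
                         'arguments' : ['-c', 'import psutil; print(psutil.__version__)'],
                         'named_env' : 'test_env'})
```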
{
"cell_type": "code",
- "execution_count": 14,
+ "execution_count": null,
"id": "9c914fc2",
"metadata": {
- "execution": {
- "iopub.execute_input": "2023-05-18T01:28:25.424512Z",
- "iopub.status.busy": "2023-05-18T01:28:25.424291Z",
- "iopub.status.idle": "2023-05-18T01:28:25.429186Z",
- "shell.execute_reply": "2023-05-18T01:28:25.428627Z"
- },
"pycharm": {
"name": "#%%\n"
}
},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\u001b[93m\u001b[1m\n",
- "\u001b[39m\u001b[0m\u001b[93m\u001b[1m--------------------------------------------------------------------------------\n",
- "\u001b[39m\u001b[0m\u001b[93m\u001b[1mfinalize \n",
- "\u001b[39m\u001b[0m\u001b[93m\u001b[1m\n",
- "\u001b[39m\u001b[0m"
- ]
- }
- ],
+ "outputs": [],
"source": [
"report.header('finalize')\n",
- "# session.close()"
+ "session.close()"
]
}
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3",
+ "display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@@ -1099,7 +558,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.10.6"
+ "version": "3.9.13"
},
"varInspector": {
"cols": {
diff --git a/docs/source/tutorials/multiple_pilots.ipynb b/docs/source/tutorials/multiple_pilots.ipynb
index c5d6267bc7..ea115d7f40 100644
--- a/docs/source/tutorials/multiple_pilots.ipynb
+++ b/docs/source/tutorials/multiple_pilots.ipynb
@@ -27,12 +27,6 @@
"\n",
"\n",
"\n",
- "__Note:__ For the initial setup regarding MongoDB see the tutorial [Getting Started](../getting_started.ipynb).\n",
- "\n",
- "
\n",
- "\n",
- "\n",
- "\n",
"__Note:__ In our examples, we will not show a progression bar while waiting for some operation to complete, e.g., while waiting for a pilot to stop. That is because the progression bar offered by RP's reporter does not work within a notebook. You could use it when executing an RP application as a standalone Python script.\n",
"\n",
"
\n"
diff --git a/docs/source/tutorials/staging_data.ipynb b/docs/source/tutorials/staging_data.ipynb
index f4b70c375b..6499bbd6f3 100644
--- a/docs/source/tutorials/staging_data.ipynb
+++ b/docs/source/tutorials/staging_data.ipynb
@@ -1,7 +1,6 @@
{
"cells": [
{
- "attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -81,8 +80,9 @@
"- `radical.pilot.LINK` - local file symlink.\n",
"\n",
"Using appropriate data actions helps to improve the application runtime. It is known that I/O operations are expensive and can negatively impact the total execution time of an application. Thus, RP applications should be built considering that:\n",
- "* the most expensive I/O operations (`TRANSFER`, `MOVE`, `COPY`) should be applied for staging between the `client://` location and corresponding paths on the target platform, since they will be performed outside of the allocated resources and will be no resources idling (pilot job is not launched at this moment);\n",
- "* task staging between sandboxes should minimize the usage of such actions as `MOVE` and `COPY`, and use the `LINK` action if possible, since these operations will be executed within the allocated resources.\n",
+ "\n",
+ "- the most expensive I/O operations (`TRANSFER`, `MOVE`, `COPY`) should be applied for staging between the `client://` location and corresponding paths on the target platform, since they will be performed outside of the allocated resources and will be no resources idling (pilot job is not launched at this moment);\n",
+ "- task staging between sandboxes should minimize the usage of such actions as `MOVE` and `COPY`, and use the `LINK` action if possible, since these operations will be executed within the allocated resources.\n",
"\n",
"In the example from the section [Examples](#Examples), we demonstrate that if all tasks have the same input data, then this data can be located in a shared space (e.g., staged to the `pilot://` location) and be linked into each task's sandbox (e.g., a link per input file within the `task://` location).\n",
"\n",
@@ -95,15 +95,17 @@
"\n",
"### Simplified directive format\n",
"\n",
- "RP gives some flexibility in the description of staging between the client side and the sandboxes for pilot and task. Thus, if a user provides just names (absolute or relative paths, e.g., names of files or directories), then RP expands them into corresponding directives. \n",
- "- If a string directive is a single path, then after expanding it, the _source_ will be a provided path within the `client://` location, while the _target_ will be a base name from a provided path within the `pilot://` or the `task://` location for [radical.pilot.PilotDescription](../apidoc.rst) or [radical.pilot.TaskDescription](../apidoc.rst) respectively.\n",
+ "RP gives some flexibility in the description of staging between the client side and the sandboxes for pilot and task. Thus, if a user provides just names (absolute or relative paths, e.g., names of files or directories), then RP expands them into corresponding directives.\n",
+ "\n",
+ "- If a string directive is a single path, then after expanding it, the _source_ will be a provided path within the `client://` location, while the _target_ will be a base name from a provided path within the `pilot://` or the `task://` location for [radical.pilot.PilotDescription](../apidoc.rst#pilotdescription) or [radical.pilot.TaskDescription](../apidoc.rst#taskdescription) respectively.\n",
"- Having directional characters `>`, `<` within a string directive defines the direction of the staging between corresponding paths:\n",
- " - Input staging: `source > target`, the _source_ defines a path within the `client://` location, and the _target_ defines a path within the `pilot://` or the `task://` location for [radical.pilot.PilotDescription](../apidoc.rst) or [radical.pilot.TaskDescription](../apidoc.rst) respectively.\n",
- " - Output staging: `target < source` (applied for [radical.pilot.TaskDescription](../apidoc.rst) only), the _source_ defines a path within the `task://` location, and the _target_ defines a path within the `client://` location.\n",
+ "\n",
+ " - Input staging: `source > target`, the _source_ defines a path within the `client://` location, and the _target_ defines a path within the `pilot://` or the `task://` location for [radical.pilot.PilotDescription](../apidoc.rst#pilotdescription) or [radical.pilot.TaskDescription](../apidoc.rst#taskdescription) respectively.\n",
+ " - Output staging: `target < source` (applied for [radical.pilot.TaskDescription](../apidoc.rst#taskdescription) only), the _source_ defines a path within the `task://` location, and the _target_ defines a path within the `client://` location.\n",
"\n",
"Examples of the staging directives being expanded:\n",
"\n",
- "[radical.pilot.PilotDescription.input_staging](../apidoc.rst)\n",
+ "[radical.pilot.PilotDescription.input_staging](../apidoc.rst#radical.pilot.PilotDescription.input_staging)\n",
"```shell\n",
"in : [ '/tmp/input_data/' ]\n",
"out: [{'source' : 'client:///tmp/input_data',\n",
@@ -117,7 +119,7 @@
" 'flags' : radical.pilot.CREATE_PARENTS}]\n",
"```\n",
"\n",
- "[radical.pilot.TaskDescription.input_staging](../apidoc.rst)\n",
+ "[radical.pilot.TaskDescription.input_staging](../apidoc.rst#radical.pilot.TaskDescription.input_staging)\n",
"```shell\n",
"in : [ '/tmp/task_input.txt' ]\n",
"out: [{'source' : 'client:///tmp/task_input.txt',\n",
@@ -126,7 +128,7 @@
" 'flags' : radical.pilot.CREATE_PARENTS}]\n",
"```\n",
"\n",
- "[radical.pilot.TaskDescription.output_staging](../apidoc.rst)\n",
+ "[radical.pilot.TaskDescription.output_staging](../apidoc.rst#radical.pilot.TaskDescription.output_staging)\n",
"```shell\n",
"in : [ 'collected.dat < output.txt' ]\n",
"out: [{'source' : 'task:///output.txt',\n",
@@ -138,53 +140,25 @@
"## Examples\n",
"\n",
"\n",
- " \n",
- "__Note:__ For setting up MongoDB see the [Getting Started](../getting_started.ipynb) tutorial.\n",
- "\n",
- "
\n",
- "\n",
- "\n",
"\n",
- "__Note:__ In our examples, we will not show a progression bar while waiting for some operation to complete, e.g., while waiting for a pilot to stop. That is because the progression bar offered by RP's reporter does not work well within a notebook. You could use the reporter's progression bar when executing your RP application as a standalone Python script.\n",
+ "__Note:__ In these examples, we will not show a progression bar while waiting for some operation to complete, e.g., while waiting for a pilot to stop. That is because the progression bar offered by RP's reporter does not work well within a notebook. You could use the reporter's progression bar when executing your RP application as a standalone Python script.\n",
"\n",
"
"
]
},
{
"cell_type": "code",
- "execution_count": 1,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2023-05-18T01:28:46.387260Z",
- "iopub.status.busy": "2023-05-18T01:28:46.386951Z",
- "iopub.status.idle": "2023-05-18T01:28:46.398220Z",
- "shell.execute_reply": "2023-05-18T01:28:46.397345Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "env: RADICAL_REPORT_ANIME=FALSE\n"
- ]
- }
- ],
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
"source": [
"%env RADICAL_REPORT_ANIME=FALSE"
]
},
{
"cell_type": "code",
- "execution_count": 2,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2023-05-18T01:28:46.434424Z",
- "iopub.status.busy": "2023-05-18T01:28:46.434209Z",
- "iopub.status.idle": "2023-05-18T01:28:46.562315Z",
- "shell.execute_reply": "2023-05-18T01:28:46.561538Z"
- }
- },
+ "execution_count": null,
+ "metadata": {},
"outputs": [],
"source": [
"import radical.pilot as rp\n",
@@ -193,28 +167,9 @@
},
{
"cell_type": "code",
- "execution_count": 3,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2023-05-18T01:28:46.565283Z",
- "iopub.status.busy": "2023-05-18T01:28:46.565052Z",
- "iopub.status.idle": "2023-05-18T01:28:58.956760Z",
- "shell.execute_reply": "2023-05-18T01:28:58.955740Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\u001b[94mnew session: \u001b[39m\u001b[0m[rp.session.three.mturilli.019495.0004]\u001b[39m\u001b[0m\u001b[94m \\\n",
- "database : \u001b[39m\u001b[0m[mongodb://rct-tutorial:****@95.217.193.116:27017/rct-tutorial]\u001b[39m\u001b[0m\u001b[92m ok\n",
- "\u001b[39m\u001b[0m\u001b[94mcreate pilot manager\u001b[39m\u001b[0m\u001b[92m ok\n",
- "\u001b[39m\u001b[0m\u001b[94mcreate task manager\u001b[39m\u001b[0m\u001b[92m ok\n",
- "\u001b[39m\u001b[0m"
- ]
- }
- ],
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
"source": [
"session = rp.Session()\n",
"pmgr = rp.PilotManager(session=session)\n",
@@ -222,62 +177,59 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
- "For this example, we create a directory `input_dir` within the current working directory, and place a file into this directory. That file will be the input data for every task (this input file is referred in the [radical.pilot.TaskDescription.arguments](../apidoc.rst) attribute). The newly created directory `input_dir` is staged into the `pilot://` location with all its files."
+ "For this example, create a new directory `input_dir` within the current working directory, and place a file into this directory. That file will be the input data for every task (this input file is referred in the [radical.pilot.TaskDescription.arguments](../apidoc.rst) attribute).\n",
+ "\n",
+ "\n",
+ "\n",
+ "__Warning:__ You need to ensure that the directory, where your script will create the data for staging, is writable. Also, you are responsible to cleanup that data after it is staged.\n",
+ "\n",
+ "
"
]
},
{
"cell_type": "code",
- "execution_count": 4,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2023-05-18T01:28:58.960227Z",
- "iopub.status.busy": "2023-05-18T01:28:58.959826Z",
- "iopub.status.idle": "2023-05-18T01:28:59.079197Z",
- "shell.execute_reply": "2023-05-18T01:28:59.077877Z"
- }
- },
+ "execution_count": null,
+ "metadata": {},
"outputs": [],
"source": [
- "!mkdir -p ./input_dir"
+ "import os\n",
+ "\n",
+ "input_dir = os.path.join(os.getcwd(), 'input_dir')\n",
+ "os.makedirs(input_dir, exist_ok=True)\n",
+ "\n",
+ "with open(input_dir + '/input.txt', 'w') as f:\n",
+ " f.write('Staged data (task_id=$RP_TASK_ID | pilot_id=$RP_PILOT_ID | session_id=$RP_SESSION_ID)')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "You will stage the newly created directory `input_dir` with all its files into the `pilot://` location.\n",
+ "\n",
+ "\n",
+ "\n",
+ "__Note:__ If provided path for `input_staging` is not an absolute path, then RP will look for it within the current working directory. Using absolute paths will guarantee that the staging data will be located correctly.\n",
+ "\n",
+ "
"
]
},
{
"cell_type": "code",
- "execution_count": 5,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2023-05-18T01:28:59.083906Z",
- "iopub.status.busy": "2023-05-18T01:28:59.083229Z",
- "iopub.status.idle": "2023-05-18T01:28:59.456334Z",
- "shell.execute_reply": "2023-05-18T01:28:59.455078Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\u001b[94msubmit 1 pilot(s)\u001b[39m\u001b[0m\n",
- " pilot.0000 local.localhost 2 cores 0 gpus\u001b[39m\u001b[0m\u001b[92m ok\n",
- "\u001b[39m\u001b[0m"
- ]
- }
- ],
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
"source": [
"# Staging directives for the pilot.\n",
"\n",
- "with open('./input_dir/input.txt', 'w') as f:\n",
- " f.write('Staged data (task_id=$RP_TASK_ID | pilot_id=$RP_PILOT_ID | session_id=$RP_SESSION_ID)')\n",
- "\n",
"pd = rp.PilotDescription({\n",
" 'resource' : 'local.localhost',\n",
" 'cores' : 2,\n",
" 'runtime' : 15,\n",
- " 'input_staging': ['input_dir'],\n",
+ " 'input_staging': [input_dir],\n",
" 'exit_on_error': False\n",
"})\n",
"\n",
@@ -294,7 +246,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -309,38 +260,11 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": null,
"metadata": {
- "execution": {
- "iopub.execute_input": "2023-05-18T01:28:59.460398Z",
- "iopub.status.busy": "2023-05-18T01:28:59.460055Z",
- "iopub.status.idle": "2023-05-18T01:29:21.091083Z",
- "shell.execute_reply": "2023-05-18T01:29:21.090256Z"
- }
+ "tags": []
},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "submit: \u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m\n",
- "\u001b[39m\u001b[0mwait : \u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m#\u001b[39m\u001b[0m\n",
- "\u001b[39m\u001b[0m\u001b[94m\tDONE : 2\n",
- "\u001b[39m\u001b[0m\u001b[92m ok\n",
- "\u001b[39m\u001b[0m"
- ]
- },
- {
- "data": {
- "text/plain": [
- "['DONE', 'DONE']"
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": [
"# Staging directives for tasks.\n",
"\n",
@@ -352,8 +276,8 @@
" output = 'output.%d.txt' % idx\n",
"\n",
" td = rp.TaskDescription({\n",
- " 'executable' : 'eval',\n",
- " 'arguments' : ['echo \"$(cat input.txt)\"'],\n",
+ " 'executable' : '/bin/echo',\n",
+ " 'arguments' : ['$(cat input.txt)'],\n",
" 'stdout' : output,\n",
" # link file from the pilot sandbox to the task sandbox\n",
" 'input_staging' : [{'source': 'pilot:///input_dir/input.txt',\n",
@@ -373,7 +297,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
@@ -382,28 +305,9 @@
},
{
"cell_type": "code",
- "execution_count": 7,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2023-05-18T01:29:21.094369Z",
- "iopub.status.busy": "2023-05-18T01:29:21.094145Z",
- "iopub.status.idle": "2023-05-18T01:29:22.102509Z",
- "shell.execute_reply": "2023-05-18T01:29:22.101554Z"
- }
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "['/home/mturilli/github/radical.pilot/docs/source/tutorials/output_dir/output.0.txt',\n",
- " '/home/mturilli/github/radical.pilot/docs/source/tutorials/output_dir/output.1.txt']"
- ]
- },
- "execution_count": 7,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
"source": [
"# Staging data from the pilot sandbox to the client working directory\n",
"\n",
@@ -414,56 +318,18 @@
},
{
"cell_type": "code",
- "execution_count": 8,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2023-05-18T01:29:22.106080Z",
- "iopub.status.busy": "2023-05-18T01:29:22.105703Z",
- "iopub.status.idle": "2023-05-18T01:29:22.225907Z",
- "shell.execute_reply": "2023-05-18T01:29:22.224513Z"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Staged data (task_id=task.000000 | pilot_id=pilot.0000 | session_id=rp.session.three.mturilli.019495.0004)\r\n",
- "Staged data (task_id=task.000001 | pilot_id=pilot.0000 | session_id=rp.session.three.mturilli.019495.0004)\r\n"
- ]
- }
- ],
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
"source": [
"!cat output_dir/*"
]
},
{
"cell_type": "code",
- "execution_count": 9,
- "metadata": {
- "execution": {
- "iopub.execute_input": "2023-05-18T01:29:22.230180Z",
- "iopub.status.busy": "2023-05-18T01:29:22.229776Z",
- "iopub.status.idle": "2023-05-18T01:29:42.278669Z",
- "shell.execute_reply": "2023-05-18T01:29:42.277689Z"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\u001b[94mclosing session rp.session.three.mturilli.019495.0004\u001b[39m\u001b[0m\u001b[94m \\\n",
- "close task manager\u001b[39m\u001b[0m\u001b[92m ok\n",
- "\u001b[39m\u001b[0m\u001b[94mclose pilot manager\u001b[39m\u001b[0m\u001b[94m \\\n",
- "wait for 1 pilot(s)\n",
- " \u001b[39m\u001b[0m\u001b[93m timeout\n",
- "\u001b[39m\u001b[0m\u001b[92m ok\n",
- "\u001b[39m\u001b[0m\u001b[94msession lifetime: 55.6s\u001b[39m\u001b[0m\u001b[92m ok\n",
- "\u001b[39m\u001b[0m"
- ]
- }
- ],
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
"source": [
"session.close(cleanup=True)"
]
@@ -485,7 +351,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.10.6"
+ "version": "3.9.13"
}
},
"nbformat": 4,
diff --git a/docs/source/tutorials/submission.ipynb b/docs/source/tutorials/submission.ipynb
index 0e8f408b46..a7385f4051 100644
--- a/docs/source/tutorials/submission.ipynb
+++ b/docs/source/tutorials/submission.ipynb
@@ -187,7 +187,6 @@
" ```\n",
" python3 -m venv /ve/my_rp_ve\n",
" . ~/ve/my_rp_ve/bin/activate\n",
- " export RADICAL_PILOT_DBURL=mongodb://user:password@ip:port/db_name\n",
" python3 my_application.py\n",
" ```\n",
"\n",
@@ -206,7 +205,6 @@
"#SBATCH --mail-type=all # Send email at begin and end of job\n",
"#SBATCH -A myproject # Project/Allocation name (req'd if you have more than 1)\n",
"\n",
- "export RADICAL_PILOT_DBURL=mongodb://user:password@ip:port/db_name\n",
"python my_application.py\n",
"```\n",
"\n",
@@ -230,7 +228,6 @@
"ssh username@frontera.tacc.utexas.edu\n",
"python3 -m venv /ve/my_rp_ve\n",
". ~/ve/my_rp_ve/bin/activate\n",
- "export RADICAL_PILOT_DBURL=mongodb://user:password@ip:port/db_name\n",
"python3 my_application.py\n",
"```\n",
"\n",
diff --git a/examples/00_getting_started.py b/examples/00_getting_started.py
index 4d4664322c..4a0595d457 100755
--- a/examples/00_getting_started.py
+++ b/examples/00_getting_started.py
@@ -25,13 +25,14 @@
report.title('Getting Started (RP version %s)' % rp.version)
# use the resource specified as argument, fall back to localhost
+ resource = None
if len(sys.argv) > 2: report.exit('Usage:\t%s [resource]\n\n' % sys.argv[0])
elif len(sys.argv) == 2: resource = sys.argv[1]
else : resource = 'local.localhost'
# Create a new session. No need to try/except this: if session creation
- # fails, there is not much we can do anyways...
+ # fails, there is not much we can do anyway...
session = rp.Session()
# all other pilot code is now tried/excepted. If an exception is caught, we
@@ -41,8 +42,8 @@
try:
# read the config used for resource details
- config = ru.read_json('%s/config.json'
- % os.path.dirname(__file__)).get(resource, {})
+ config = ru.read_json('%s/config.json' %
+ os.path.dirname(__file__)).get(resource, {})
pmgr = rp.PilotManager(session=session)
tmgr = rp.TaskManager(session=session)
@@ -53,7 +54,7 @@
# Define an [n]-core local pilot that runs for [x] minutes
# Here we use a dict to initialize the description object
pd_init = {'resource' : resource,
- 'runtime' : 30, # pilot runtime (min)
+ 'runtime' : 15, # pilot runtime (min)
'exit_on_error' : True,
'project' : config.get('project'),
'queue' : config.get('queue'),
@@ -81,7 +82,8 @@
# create a new task description, and fill it.
td = rp.TaskDescription()
- td.executable = '/bin/date'
+ td.executable = '/bin/sleep'
+ td.arguments = ['1']
td.ranks = 1
td.cores_per_rank = 1
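For reference, a minimal sketch of the same task expressed with dict initialization, a stylistic alternative rather than part of this change:

```python
# Sketch: the same task expressed via dict initialization, as used for the
# pilot description earlier in this example.
td = rp.TaskDescription({'executable'    : '/bin/sleep',
                         'arguments'     : ['1'],
                         'ranks'         : 1,
                         'cores_per_rank': 1})
```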
diff --git a/examples/01_task_details.py b/examples/01_task_details.py
index 35e65d448f..50072854fc 100755
--- a/examples/01_task_details.py
+++ b/examples/01_task_details.py
@@ -5,6 +5,7 @@
import os
import sys
+import time
verbose = os.environ.get('RADICAL_PILOT_VERBOSE', 'REPORT')
os.environ['RADICAL_PILOT_VERBOSE'] = verbose
@@ -45,53 +46,54 @@
# read the config used for resource details
report.info('read config')
- config = ru.read_json('%s/config.json' % os.path.dirname(os.path.abspath(__file__)))
+ config = ru.read_json('%s/config.json' % os.path.dirname(__file__))
report.ok('>>ok\n')
report.header('submit pilots')
# Add a PilotManager. PilotManagers manage one or more pilots.
pmgr = rp.PilotManager(session=session)
+ tmgr = rp.TaskManager(session=session)
# Define an [n]-core local pilot that runs for [x] minutes
# Here we use a dict to initialize the description object
pd_init = {'resource' : resource,
- 'runtime' : 15, # pilot runtime (min)
+ 'runtime' : 300,
'exit_on_error' : True,
'project' : config[resource].get('project', None),
'queue' : config[resource].get('queue', None),
'access_schema' : config[resource].get('schema', None),
- 'cores' : config[resource].get('cores', 1),
+ 'cores' : 1024 * 16,
'gpus' : config[resource].get('gpus', 0),
}
pdesc = rp.PilotDescription(pd_init)
# Launch the pilot.
pilot = pmgr.submit_pilots(pdesc)
-
-
+ # pmgr.wait_pilots(uids=pilot.uid, state=rp.PMGR_ACTIVE)
report.header('submit tasks')
# Register the pilot in a TaskManager object.
- tmgr = rp.TaskManager(session=session)
tmgr.add_pilots(pilot)
# Create a workload of tasks.
# Each task runs '/bin/date'.
- n = 128 # number of tasks to run
- report.info('create %d task description(s)\n\t' % n)
+ n = 1 * 1024 # number of tasks to run
+ report.info('create %d task description(s)\n' % n)
tds = list()
+ report.progress_tgt(n, label='create')
for i in range(0, n):
# create a new task description, and fill it.
# Here we don't use dict initialization.
td = rp.TaskDescription()
td.executable = '/bin/date'
+ td.sandbox = 'task_sandbox'
tds.append(td)
report.progress()
- report.ok('>>ok\n')
+ report.progress_done()
# Submit the previously created task descriptions to the
# PilotManager. This will trigger the selected scheduler to start
@@ -103,10 +105,10 @@
tmgr.wait_tasks()
report.info('\n')
- for task in tasks:
- report.plain(' * %s: %s, exit: %3s, out: %s\n'
+ for task in tasks[:10]:
+ report.plain(' * %s: %s, exit: %3s, out: %s'
% (task.uid, task.state[:4],
- task.exit_code, task.stdout[:35]))
+ task.exit_code, task.stdout))
# get some more details for one task:
task_dict = tasks[0].as_dict()
@@ -115,13 +117,6 @@
report.plain("exit code : %s\n" % task_dict['exit_code'])
report.plain("stdout : %s\n" % task_dict['stdout'])
- # get some more details for one task:
- task_dict = tasks[1].as_dict()
- report.plain("task workdir : %s\n" % task_dict['task_sandbox'])
- report.plain("pilot id : %s\n" % task_dict['pilot'])
- report.plain("exit code : %s\n" % task_dict['exit_code'])
- report.plain("exit stdout : %s\n" % task_dict['stdout'])
-
except Exception as e:
# Something unexpected happened in the pilot code above
@@ -139,7 +134,7 @@
# always clean up the session, no matter if we caught an exception or
# not. This will kill all remaining pilots.
report.header('finalize')
- session.close()
+ session.close(download=True)
report.header()
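The `progress_tgt` / `progress` / `progress_done` calls introduced in this example form the radical.utils progress-bar idiom; a minimal self-contained sketch:

```python
# Sketch: set a progress target, tick once per item, then close the bar.
import radical.utils as ru

report = ru.Reporter(name='radical.pilot')
report.progress_tgt(4, label='create')
for _ in range(4):
    report.progress()
report.progress_done()
```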
diff --git a/examples/02_failing_tasks.py b/examples/02_failing_tasks.py
index 3214c05148..2551b73f3d 100755
--- a/examples/02_failing_tasks.py
+++ b/examples/02_failing_tasks.py
@@ -142,7 +142,7 @@
# not. This will kill all remaining pilots.
report.header('finalize')
if session:
- session.close(cleanup=False)
+ session.close()
report.header()
diff --git a/examples/03_multiple_pilots.py b/examples/03_multiple_pilots.py
index a67627d9c3..37db17df60 100755
--- a/examples/03_multiple_pilots.py
+++ b/examples/03_multiple_pilots.py
@@ -71,6 +71,10 @@
# Launch the pilots.
pilots = pmgr.submit_pilots(pdescs)
+ for pilot in pilots:
+ pilot.prepare_env('numpy_env', {'type' : 'virtualenv',
+ 'setup': ['numpy']})
+
for gen in range(1):
@@ -135,7 +139,7 @@
# always clean up the session, no matter if we caught an exception or
# not. This will kill all remaining pilots.
report.header('finalize')
- session.close(cleanup=False)
+ session.close()
report.header()
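With `prepare_env()` now called per pilot, tasks opt into the provisioned environment via `named_env`. A minimal sketch reusing the `numpy_env` name from this hunk:

```python
import radical.pilot as rp

# per pilot, as above:
#   pilot.prepare_env('numpy_env', {'type' : 'virtualenv',
#                                   'setup': ['numpy']})

td = rp.TaskDescription()
td.executable = 'python3'
td.arguments  = ['-c', 'import numpy; print(numpy.__version__)']
td.named_env  = 'numpy_env'   # run inside the prepared virtualenv
```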
diff --git a/examples/12_task_env.py b/examples/12_task_env.py
index 6e69b6a0e4..8f92787d50 100755
--- a/examples/12_task_env.py
+++ b/examples/12_task_env.py
@@ -1,154 +1,154 @@
-#!/usr/bin/env python
-
-__copyright__ = 'Copyright 2013-2014, http://radical.rutgers.edu'
-__license__ = 'MIT'
-
-import os
-import sys
-
-verbose = os.environ.get('RADICAL_PILOT_VERBOSE', 'REPORT')
-os.environ['RADICAL_PILOT_VERBOSE'] = verbose
-
-import radical.pilot as rp
-import radical.utils as ru
-
-
-# ------------------------------------------------------------------------------
-#
-# READ the RADICAL-Pilot documentation: https://radicalpilot.readthedocs.io/
-#
-# ------------------------------------------------------------------------------
-
-
-# ------------------------------------------------------------------------------
-#
-if __name__ == '__main__':
-
- # we use a reporter class for nicer output
- report = ru.Reporter(name='radical.pilot')
- report.title('Getting Started (RP version %s)' % rp.version)
-
- # use the resource specified as argument, fall back to localhost
- if len(sys.argv) > 2: report.exit('Usage:\t%s [resource]\n\n' % sys.argv[0])
- elif len(sys.argv) == 2: resource = sys.argv[1]
- else : resource = 'local.localhost'
-
- # Create a new session. No need to try/except this: if session creation
- # fails, there is not much we can do anyways...
- session = rp.Session()
-
- # all other pilot code is now tried/excepted. If an exception is caught, we
- # can rely on the session object to exist and be valid, and we can thus tear
- # the whole RP stack down via a 'session.close()' call in the 'finally'
- # clause...
- try:
-
- # read the config used for resource details
- report.info('read config')
- config = ru.read_json('%s/config.json' % os.path.dirname(__file__))
- report.ok('>>ok\n')
-
- report.header('submit pilots')
-
- # Add a PilotManager. PilotManagers manage one or more pilots.
- pmgr = rp.PilotManager(session=session)
-
- # Define an [n]-core local pilot that runs for [x] minutes
- # Here we use a dict to initialize the description object
- pd_init = {'resource' : resource,
- 'runtime' : 15, # pilot runtime (min)
- 'exit_on_error' : True,
- 'project' : config[resource].get('project', None),
- 'queue' : config[resource].get('queue', None),
- 'access_schema' : config[resource].get('schema', None),
- 'cores' : config[resource].get('cores', 1),
- 'gpus' : config[resource].get('gpus', 0),
- }
- pdesc = rp.PilotDescription(pd_init)
-
- # Launch the pilot.
- pilot = pmgr.submit_pilots(pdesc)
-
- pilot.prepare_env('numpy_env', {'type' : 'virtualenv',
- 'version': '3.7',
- 'setup' : ['numpy']})
-
-
- report.header('submit tasks')
-
- # Register the pilot in a TaskManager object.
- tmgr = rp.TaskManager(session=session)
- tmgr.add_pilots(pilot)
-
- # Create a workload of tasks.
- # Each task runs '/bin/date'.
- n = 2 # number of tasks to run
- report.info('create %d task description(s)\n\t' % n)
-
- tds = list()
- for i in range(0, n):
-
- # create a new task description, and fill it.
- # Here we don't use dict initialization.
- td = rp.TaskDescription()
- td.executable = 'python3'
- td.arguments = ['-c', 'import numpy; print(numpy.__file__)']
- td.named_env = 'numpy_env'
- tds.append(td)
- report.progress()
-
- report.ok('>>ok\n')
-
- # Submit the previously created task descriptions to the
- # PilotManager. This will trigger the selected scheduler to start
- # assigning tasks to the pilots.
- tasks = tmgr.submit_tasks(tds)
-
- # Wait for all tasks to reach a final state (DONE, CANCELED or FAILED).
- report.header('gather results')
- tmgr.wait_tasks()
-
- report.info('\n')
- for task in tasks:
- report.plain(' * %s: %s, exit: %3s, out: %s\n'
- % (task.uid, task.state[:4],
- task.exit_code, task.stdout[:35]))
-
- # get some more details for one task:
- task_dict = tasks[0].as_dict()
- report.plain("task workdir : %s\n" % task_dict['task_sandbox'])
- report.plain("pilot id : %s\n" % task_dict['pilot'])
- report.plain("exit code : %s\n" % task_dict['exit_code'])
- report.plain("stdout : %s\n" % task_dict['stdout'])
-
- # get some more details for one task:
- task_dict = tasks[1].as_dict()
- report.plain("task workdir : %s\n" % task_dict['task_sandbox'])
- report.plain("pilot id : %s\n" % task_dict['pilot'])
- report.plain("exit code : %s\n" % task_dict['exit_code'])
- report.plain("exit stdout : %s\n" % task_dict['stdout'])
-
-
- except Exception as e:
- # Something unexpected happened in the pilot code above
- report.error('caught Exception: %s\n' % e)
- raise
-
- except (KeyboardInterrupt, SystemExit):
- # the callback called sys.exit(), and we can here catch the
- # corresponding KeyboardInterrupt exception for shutdown. We also catch
- # SystemExit (which gets raised if the main threads exits for some other
- # reason).
- report.warn('exit requested\n')
-
- finally:
- # always clean up the session, no matter if we caught an exception or
- # not. This will kill all remaining pilots.
- report.header('finalize')
- session.close()
-
- report.header()
-
-
-# ------------------------------------------------------------------------------
-
+#!/usr/bin/env python3
+
+__copyright__ = 'Copyright 2013-2014, http://radical.rutgers.edu'
+__license__ = 'MIT'
+
+import os
+import sys
+
+verbose = os.environ.get('RADICAL_PILOT_VERBOSE', 'REPORT')
+os.environ['RADICAL_PILOT_VERBOSE'] = verbose
+
+import radical.pilot as rp
+import radical.utils as ru
+
+
+# ------------------------------------------------------------------------------
+#
+# READ the RADICAL-Pilot documentation: https://radicalpilot.readthedocs.io/
+#
+# ------------------------------------------------------------------------------
+
+
+# ------------------------------------------------------------------------------
+#
+if __name__ == '__main__':
+
+ # we use a reporter class for nicer output
+ report = ru.Reporter(name='radical.pilot')
+ report.title('Getting Started (RP version %s)' % rp.version)
+
+ # use the resource specified as argument, fall back to localhost
+ if len(sys.argv) > 2: report.exit('Usage:\t%s [resource]\n\n' % sys.argv[0])
+ elif len(sys.argv) == 2: resource = sys.argv[1]
+ else : resource = 'local.localhost'
+
+ # Create a new session. No need to try/except this: if session creation
+ # fails, there is not much we can do anyways...
+ session = rp.Session()
+
+ # all other pilot code is now tried/excepted. If an exception is caught, we
+ # can rely on the session object to exist and be valid, and we can thus tear
+ # the whole RP stack down via a 'session.close()' call in the 'finally'
+ # clause...
+ try:
+
+ # read the config used for resource details
+ report.info('read config')
+ config = ru.read_json('%s/config.json' % os.path.dirname(__file__))
+ report.ok('>>ok\n')
+
+ report.header('submit pilots')
+
+ # Add a PilotManager. PilotManagers manage one or more pilots.
+ pmgr = rp.PilotManager(session=session)
+
+ # Define an [n]-core local pilot that runs for [x] minutes
+ # Here we use a dict to initialize the description object
+ pd_init = {'resource' : resource,
+ 'runtime' : 15, # pilot runtime (min)
+ 'exit_on_error' : True,
+ 'project' : config[resource].get('project', None),
+ 'queue' : config[resource].get('queue', None),
+ 'access_schema' : config[resource].get('schema', None),
+ 'cores' : config[resource].get('cores', 1),
+ 'gpus' : config[resource].get('gpus', 0),
+ }
+ pdesc = rp.PilotDescription(pd_init)
+
+ # Launch the pilot.
+ pilot = pmgr.submit_pilots(pdesc)
+
+ report.header('prepare task env')
+ pilot.prepare_env('numpy_env', {'type' : 'virtualenv',
+ 'setup': ['numpy']})
+ report.ok('ok')
+
+ report.header('submit tasks')
+
+ # Register the pilot in a TaskManager object.
+ tmgr = rp.TaskManager(session=session)
+ tmgr.add_pilots(pilot)
+
+ # Create a workload of tasks.
+        # Each task runs a small python3 script that imports numpy.
+ n = 2 # number of tasks to run
+ report.info('create %d task description(s)\n\t' % n)
+
+ tds = list()
+ for i in range(0, n):
+
+ # create a new task description, and fill it.
+ # Here we don't use dict initialization.
+ td = rp.TaskDescription()
+ td.executable = 'python3'
+ td.arguments = ['-c', 'import numpy; print(numpy.__file__)']
+ td.named_env = 'numpy_env'
+ tds.append(td)
+ report.progress()
+
+ report.ok('>>ok\n')
+
+ # Submit the previously created task descriptions to the
+ # PilotManager. This will trigger the selected scheduler to start
+ # assigning tasks to the pilots.
+ tasks = tmgr.submit_tasks(tds)
+
+ # Wait for all tasks to reach a final state (DONE, CANCELED or FAILED).
+ report.header('gather results')
+ tmgr.wait_tasks()
+
+ report.info('\n')
+ for task in tasks:
+ report.plain(' * %s: %s, exit: %3s, out: %s\n'
+ % (task.uid, task.state[:4],
+ task.exit_code, task.stdout[:35]))
+
+ # get some more details for one task:
+ task_dict = tasks[0].as_dict()
+ report.plain("task workdir : %s\n" % task_dict['task_sandbox'])
+ report.plain("pilot id : %s\n" % task_dict['pilot'])
+ report.plain("exit code : %s\n" % task_dict['exit_code'])
+ report.plain("stdout : %s\n" % task_dict['stdout'])
+
+ # get some more details for one task:
+ task_dict = tasks[1].as_dict()
+ report.plain("task workdir : %s\n" % task_dict['task_sandbox'])
+ report.plain("pilot id : %s\n" % task_dict['pilot'])
+ report.plain("exit code : %s\n" % task_dict['exit_code'])
+ report.plain("exit stdout : %s\n" % task_dict['stdout'])
+
+
+ except Exception as e:
+ # Something unexpected happened in the pilot code above
+ report.error('caught Exception: %s\n' % e)
+ raise
+
+ except (KeyboardInterrupt, SystemExit):
+ # the callback called sys.exit(), and we can here catch the
+ # corresponding KeyboardInterrupt exception for shutdown. We also catch
+ # SystemExit (which gets raised if the main threads exits for some other
+ # reason).
+ report.warn('exit requested\n')
+
+ finally:
+ # always clean up the session, no matter if we caught an exception or
+ # not. This will kill all remaining pilots.
+ report.header('finalize')
+ session.close()
+
+ report.header()
+
+
+# ------------------------------------------------------------------------------
+
diff --git a/examples/agent_services.py b/examples/agent_services.py
index f854af6536..a0ce10a51d 100755
--- a/examples/agent_services.py
+++ b/examples/agent_services.py
@@ -39,7 +39,10 @@
report.header('submit pilots')
- # Add a PilotManager. PilotManagers manage one or more pilots.
+ # Also define a (dummy) service to be run by the pilot
+ sd = rp.TaskDescription({'executable': '/bin/sh',
+ 'arguments' : ['-c', 'radical-pilot-service-signal'],
+ 'named_env' : 'rp'})
# Define an [n]-core local pilot that runs for [x] minutes
# Here we use a dict to initialize the description object
@@ -51,9 +54,7 @@
'access_schema' : config.get('schema'),
'cores' : config.get('cores', 1),
'gpus' : config.get('gpus', 0),
- # TODO create shell script
- 'services' :[rp.TaskDescription({'executable':'free -h'}),
- rp.TaskDescription({'executable':'free -h'}) ]
+ 'services' : [sd, sd]
}
pdesc = rp.PilotDescription(pd_init)
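The service description replaces the `free -h` placeholders: services are long-running helper tasks the agent launches before the workload, and `radical-pilot-service-signal` (added to `setup.py` further down) reports them as up. A hedged sketch of attaching services to a pilot:

```python
import radical.pilot as rp

sd = rp.TaskDescription({'executable': '/bin/sh',
                         'arguments' : ['-c', 'radical-pilot-service-signal'],
                         'named_env' : 'rp'})    # the pilot's own env

pd = rp.PilotDescription({'resource': 'local.localhost',
                          'runtime' : 30,
                          'cores'   : 4,
                          'services': [sd, sd]})  # started before any task runs
```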
diff --git a/examples/data_staging/io_staging_dict.py b/examples/data_staging/io_staging_dict.py
index 2a48b781d3..2828c3fdee 100755
--- a/examples/data_staging/io_staging_dict.py
+++ b/examples/data_staging/io_staging_dict.py
@@ -140,7 +140,7 @@ def task_state_cb (task, state):
# the above is equivalent to
#
- # session.close (cleanup=True, terminate=True)
+ # session.close (terminate=True)
#
# it will thus both clean out the session's database record, and kill
# all remaining pilots (none in our example).
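The same one-line comment fix recurs in the staging and docs examples below: the `cleanup` flag is gone because, with MongoDB removed (`pymongo` is dropped from `requirements.txt` further down), there is no session database record left to purge. What remains, as a hedged sketch:

```python
import radical.pilot as rp

session = rp.Session()
# ... submit pilots and tasks ...

# terminate=True : kill any pilots still active when the session closes
# download=False : leave logs and profiles on the target resource
session.close(terminate=True, download=False)
```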
diff --git a/examples/data_staging/io_staging_pipeline.py b/examples/data_staging/io_staging_pipeline.py
index 21d7c880b2..f124c1c4a8 100755
--- a/examples/data_staging/io_staging_pipeline.py
+++ b/examples/data_staging/io_staging_pipeline.py
@@ -160,7 +160,7 @@ def task_state_cb (task, state):
# the above is equivalent to
#
- # session.close (cleanup=True, terminate=True)
+ # session.close (terminate=True)
#
# it will thus both clean out the session's database record, and kill
# all remaining pilots (none in our example).
diff --git a/examples/data_staging/io_staging_shared.py b/examples/data_staging/io_staging_shared.py
index ba1c09bc91..f85d55e7f9 100755
--- a/examples/data_staging/io_staging_shared.py
+++ b/examples/data_staging/io_staging_shared.py
@@ -138,7 +138,7 @@
# the above is equivalent to
#
- # session.close (cleanup=True, terminate=True)
+ # session.close (terminate=True)
#
# it will thus both clean out the session's database record, and kill
# all remaining pilots (none in our example).
diff --git a/examples/data_staging/io_staging_simple.py b/examples/data_staging/io_staging_simple.py
index 18fc2c02a6..8e12a36e81 100755
--- a/examples/data_staging/io_staging_simple.py
+++ b/examples/data_staging/io_staging_simple.py
@@ -129,7 +129,7 @@ def task_state_cb (task, state):
# the above is equivalent to
#
- # session.close (cleanup=True, terminate=True)
+ # session.close (terminate=True)
#
# it will thus both clean out the session's database record, and kill
# all remaining pilots (none in our example).
diff --git a/examples/docs/chained_tasks.py b/examples/docs/chained_tasks.py
index f8f46f4e27..ca55868773 100755
--- a/examples/docs/chained_tasks.py
+++ b/examples/docs/chained_tasks.py
@@ -164,7 +164,7 @@
# the above is equivalent to
#
- # session.close (cleanup=True, terminate=True)
+ # session.close (terminate=True)
#
# it will thus both clean out the session's database record, and kill
# all remaining pilots (none in our example).
diff --git a/examples/docs/coupled_tasks.py b/examples/docs/coupled_tasks.py
index d4020ce958..5b1dc9b744 100755
--- a/examples/docs/coupled_tasks.py
+++ b/examples/docs/coupled_tasks.py
@@ -174,7 +174,7 @@
# the above is equivalent to
#
- # session.close (cleanup=True, terminate=True)
+ # session.close (terminate=True)
#
# it will thus both clean out the session's database record, and kill
# all remaining pilots (none in our example).
diff --git a/examples/docs/mpi_tasks.py b/examples/docs/mpi_tasks.py
index 96ecdc29d8..39a2349c52 100755
--- a/examples/docs/mpi_tasks.py
+++ b/examples/docs/mpi_tasks.py
@@ -177,7 +177,7 @@ def task_state_cb (task, state):
# the above is equivalent to
#
- # session.close (cleanup=True, terminate=True)
+ # session.close (terminate=True)
#
# it will thus both clean out the session's database record, and kill
# all remaining pilots (none in our example).
diff --git a/examples/docs/simple_bot.py b/examples/docs/simple_bot.py
index 825b1d9d7e..5eda616f17 100755
--- a/examples/docs/simple_bot.py
+++ b/examples/docs/simple_bot.py
@@ -125,7 +125,7 @@
# the above is equivalent to
#
- # session.close (cleanup=True, terminate=True)
+ # session.close (terminate=True)
#
# it will thus both clean out the session's database record, and kill
# all remaining pilots.
diff --git a/examples/docs/simple_bot_mult_res.py b/examples/docs/simple_bot_mult_res.py
index 2d1bc298c2..c916d257c5 100644
--- a/examples/docs/simple_bot_mult_res.py
+++ b/examples/docs/simple_bot_mult_res.py
@@ -192,7 +192,7 @@ def task_state_cb (task, state):
# the above is equivalent to
#
- # session.close (cleanup=True, terminate=True)
+ # session.close (terminate=True)
#
# it will thus both clean out the session's database record, and kill
# all remaining pilots (none in our example).
diff --git a/examples/misc/backfilling.py b/examples/misc/backfilling.py
index 249c418c9d..7c7b34a172 100755
--- a/examples/misc/backfilling.py
+++ b/examples/misc/backfilling.py
@@ -150,7 +150,7 @@ def task_state_cb (task, state):
# the above is equivalent to
#
- # session.close (cleanup=True, terminate=True)
+ # session.close (terminate=True)
#
# it will thus both clean out the session's database record, and kill
# all remaining pilots (none in our example).
diff --git a/examples/misc/backfilling_recovery.py b/examples/misc/backfilling_recovery.py
index 234c8ebf6a..f8ee89c98d 100755
--- a/examples/misc/backfilling_recovery.py
+++ b/examples/misc/backfilling_recovery.py
@@ -179,7 +179,7 @@ def wait_queue_size_cb(tmgr, wait_queue_size):
# the above is equivalent to
#
- # session.close (cleanup=True, terminate=True)
+ # session.close (terminate=True)
#
# it will thus both clean out the session's database record, and kill
# all remaining pilots (none in our example).
diff --git a/examples/misc/ordered_pipelines.py b/examples/misc/ordered_pipelines.py
index acc029c1e6..b3161f402c 100755
--- a/examples/misc/ordered_pipelines.py
+++ b/examples/misc/ordered_pipelines.py
@@ -66,8 +66,8 @@
td.arguments = [p, s, t, 10]
td.ranks = 1
td.tags = {'order': {'ns' : p,
- 'order': s,
- 'size' : n_tasks}}
+ 'order': s,
+ 'size' : n_tasks}}
td.name = 'p%03d-s%03d-t%03d' % (p, s, t)
tds.append(td)
report.progress()
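The realigned `tags` block is what RP's ordered scheduler consumes: tasks sharing a namespace `ns` run stage by stage in `order`, with `size` tasks per stage. A sketch with illustrative values (semantics as used by this example):

```python
import radical.pilot as rp

td = rp.TaskDescription()
td.executable = '/bin/true'
td.tags       = {'order': {'ns'   : 0,     # pipeline namespace
                           'order': 2,     # stage index within that pipeline
                           'size' : 10}}   # number of tasks per stage
```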
diff --git a/examples/misc/raptor.cfg b/examples/misc/raptor.cfg
index b996e183a8..0e9365411d 100644
--- a/examples/misc/raptor.cfg
+++ b/examples/misc/raptor.cfg
@@ -32,13 +32,13 @@
"master_descr": {
"mode" : "raptor.master",
- "named_env" : "ve_raptor",
+ "named_env" : "rp",
"executable" : "./raptor_master.py"
},
"worker_descr": {
"mode" : "raptor.worker",
- "named_env" : "ve_raptor",
+ "named_env" : "rp",
# custom worker class
"raptor_class" : "MyWorker",
diff --git a/examples/misc/raptor.py b/examples/misc/raptor.py
index 1a04cbf49d..1b1fccfec2 100755
--- a/examples/misc/raptor.py
+++ b/examples/misc/raptor.py
@@ -134,18 +134,6 @@ def task_state_cb(task, state):
'target': 'radical-pilot-hello.sh',
'action': rp.TRANSFER})
- # Issue an RPC to provision a Python virtual environment for the later
- # raptor tasks. Note that we are telling prepare_env to install
- # radical.pilot and radical.utils from sdist archives on the local
- # filesystem. This only works for the default resource, local.localhost.
- report.info('Call pilot.prepare_env()... ')
- pilot.prepare_env(env_name='ve_raptor',
- env_spec={'type' : 'venv',
- 'setup': [rp.sdist_path,
- ru.sdist_path,
- 'mpi4py']})
- report.info('done\n')
-
# Launch a raptor master task, which will launch workers and self-submit
# some additional tasks for illustration purposes.
@@ -161,7 +149,6 @@ def task_state_cb(task, state):
td.arguments = [cfg_file, i]
td.cpu_processes = 1
td.cpu_threads = cores_per_master
- td.named_env = 'rp'
td.input_staging = [{'source': '%s/raptor_master.py' % PWD,
'target': 'raptor_master.py',
'action': rp.TRANSFER,
diff --git a/examples/misc/raptor_master.py b/examples/misc/raptor_master.py
index 100866b251..9a04feafd0 100755
--- a/examples/misc/raptor_master.py
+++ b/examples/misc/raptor_master.py
@@ -65,7 +65,7 @@ class MyMaster(rp.raptor.Master):
# --------------------------------------------------------------------------
#
- def __init__(self, cfg):
+ def __init__(self, cfg: ru.Config):
self._cnt = 0
self._submitted = defaultdict(int)
@@ -73,9 +73,11 @@ def __init__(self, cfg):
# initialize the task overlay base class. That base class will ensure
# proper communication channels to the pilot agent.
+ ru.write_json('m1.json', cfg)
super().__init__(cfg=cfg)
+ ru.write_json('m2.json', self._cfg)
- self._sleep = self._cfg.sleep
+ self._sleep = self._raptor_cfg.sleep
# --------------------------------------------------------------------------
@@ -266,7 +268,6 @@ def result_cb(self, tasks):
cores_per_node = cfg.cores_per_node
gpus_per_node = cfg.gpus_per_node
descr = cfg.worker_descr
- pwd = os.getcwd()
# one node is used by master. Alternatively (and probably better), we could
# reduce one of the worker sizes by one core. But it somewhat depends on
@@ -280,7 +281,6 @@ def result_cb(self, tasks):
# insert `n` worker tasks into the agent. The agent will schedule (place)
# those workers and execute them. Insert one smaller worker (see above)
# NOTE: this assumes a certain worker size / layout
- out('workers: %d' % n_workers)
descr['ranks'] = nodes_per_worker * cores_per_node
descr['gpus_per_rank'] = nodes_per_worker * gpus_per_node
worker_ids = master.submit_workers(
@@ -292,19 +292,15 @@ def result_cb(self, tasks):
# FIXME
master.wait_workers(count=1)
- out('start')
master.start()
- out('submit')
master.submit()
out('stop')
# TODO: can be run from thread?
master.stop()
- out('join')
# TODO: worker state callback
master.join()
- out('done')
# TODO: expose RPC hooks
diff --git a/examples/misc/raptor_simple.py b/examples/misc/raptor_simple.py
index edcbff39de..d8ce99c327 100755
--- a/examples/misc/raptor_simple.py
+++ b/examples/misc/raptor_simple.py
@@ -41,9 +41,10 @@ def task_state_cb(task, state):
tmgr.wait_tasks(task.uid)
print('%s [%s]: %s' % (task.uid, task.state, task.stdout))
- raptor.rpc('stop')
- tmgr.wait_tasks(raptor.uid)
- print('%s [%s]: %s' % (raptor.uid, raptor.state, raptor.stdout))
+ # FIXME: MongoDB
+ # raptor.rpc('stop')
+ # tmgr.wait_tasks(raptor.uid)
+ # print('%s [%s]: %s' % (raptor.uid, raptor.state, raptor.stdout))
finally:
session.close(download=False)
diff --git a/requirements-ci.txt b/requirements-ci.txt
new file mode 100644
index 0000000000..bf123663fc
--- /dev/null
+++ b/requirements-ci.txt
@@ -0,0 +1,26 @@
+
+# the branches provided for the RCT components below can be edited
+# if a dependency on a non-devel branch is required
+
+# default RCT branch for CI runs is "devel"
+
+radical.utils @ git+https://github.com/radical-cybertools/radical.utils@devel_nodb_2
+radical.saga @ git+https://github.com/radical-cybertools/radical.saga@devel
+radical.gtod @ git+https://github.com/radical-cybertools/radical.gtod@devel
+radical.analytics @ git+https://github.com/radical-cybertools/radical.analytics.git@devel
+
+# RP from the current branch
+.
+
+setproctitle
+dill
+
+pytest
+pytest-timeout
+pylint
+flake8
+flake8-per-file-ignores
+coverage
+psutil
+pudb
+
diff --git a/requirements-devel b/requirements-devel
deleted file mode 100644
index c68c03d58d..0000000000
--- a/requirements-devel
+++ /dev/null
@@ -1,7 +0,0 @@
-
--r requirements.txt
--r requirements-docs.txt
--r requirements-tests.txt
-
-jupyter
-
diff --git a/requirements-docs-ci.txt b/requirements-docs-ci.txt
new file mode 100644
index 0000000000..572291160b
--- /dev/null
+++ b/requirements-docs-ci.txt
@@ -0,0 +1,13 @@
+
+-r requirements-ci.txt
+
+sphinx==5.3.0
+ipython
+ipykernel
+nbsphinx==0.8.12
+python-dotenv[cli]
+sphinx_copybutton
+sphinx_rtd_theme>=0.5.1
+myst_parser
+docutils==0.17.1
+
diff --git a/requirements-docs.txt b/requirements-docs.txt
index 8c5331dfc1..24d33e983b 100644
--- a/requirements-docs.txt
+++ b/requirements-docs.txt
@@ -1,6 +1,8 @@
-r requirements.txt
+radical.pilot>=1.36
+
sphinx==5.3.0
ipython
ipykernel
@@ -8,7 +10,6 @@ nbsphinx==0.8.12
python-dotenv[cli]
sphinx_copybutton
sphinx_rtd_theme>=0.5.1
-radical.pilot>=1.14
myst_parser
docutils==0.17.1
diff --git a/requirements-tests.txt b/requirements-tests.txt
deleted file mode 100644
index 109932dafe..0000000000
--- a/requirements-tests.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-
--r requirements.txt
-
-pytest
-pytest-timeout
-pylint
-flake8
-flake8-per-file-ignores
-coverage
-psutil
-pudb
-
diff --git a/requirements.txt b/requirements.txt
index 1556c9f9e6..3a05a2fff5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,8 +1,6 @@
-
-radical.utils>=1.12
+radical.utils @ git+https://github.com/radical-cybertools/radical.utils@devel_nodb_2
radical.saga>=1.12
radical.gtod
-pymongo<4
setproctitle
dill
diff --git a/setup.py b/setup.py
index ba33752e4f..9dbc49d139 100755
--- a/setup.py
+++ b/setup.py
@@ -251,7 +251,9 @@ def run(self):
'packages' : find_namespace_packages('src', include=['radical.*']),
'package_dir' : {'': 'src'},
'scripts' : [
- 'bin/radical-pilot-agent',
+ 'bin/radical-pilot-agent_0',
+ 'bin/radical-pilot-agent_n',
+ # 'bin/radical-pilot-agent-bridge',
'bin/radical-pilot-agent-statepush',
'bin/radical-pilot-bridge',
'bin/radical-pilot-bson2json',
@@ -272,6 +274,7 @@ def run(self):
'bin/radical-pilot-raptor-worker',
'bin/radical-pilot-resources',
'bin/radical-pilot-run-session',
+ 'bin/radical-pilot-service-signal',
'bin/radical-pilot-stats',
'bin/radical-pilot-stats.plot',
'bin/radical-pilot-ve',
diff --git a/src/radical/pilot/__init__.py b/src/radical/pilot/__init__.py
index 34f123abc3..e5bfbb262f 100644
--- a/src/radical/pilot/__init__.py
+++ b/src/radical/pilot/__init__.py
@@ -18,6 +18,7 @@
# import API
from .session import Session
from .context import Context
+from .proxy import Proxy
from .task_manager import TaskManager
from .task import Task
diff --git a/src/radical/pilot/agent/agent_0.py b/src/radical/pilot/agent/agent_0.py
index f6b2b658d3..c2d2fa84d6 100644
--- a/src/radical/pilot/agent/agent_0.py
+++ b/src/radical/pilot/agent/agent_0.py
@@ -4,9 +4,9 @@
import copy
import os
-import pprint
import stat
import time
+import pprint
import threading as mt
@@ -15,9 +15,9 @@
from .. import utils as rpu
from .. import states as rps
from .. import constants as rpc
from .. import Session
from .. import TaskDescription, AGENT_SERVICE
-from ..db import DBSession
from .resource_manager import ResourceManager
@@ -31,47 +31,38 @@ class Agent_0(rpu.Worker):
the sub-agents die, it will shut down the other sub-agents and itself.
This class inherits the rpu.Worker, so that it can use its communication
- bridges and callback mechanisms. Specifically, it will pull the DB for
- new tasks to be executed and forwards them to the agent's component
- network (see `work()`). It will also watch the DB for any commands to be
- forwarded (pilot termination, task cancellation, etc.), and will take care
- of heartbeat messages to be sent to the client module. To do all this, it
- initializes a DB connection in `initialize()`.
+ bridges and callback mechanisms.
'''
# --------------------------------------------------------------------------
#
- def __init__(self, cfg: ru.Config, session: Session):
+ def __init__(self):
+
+ cfg = ru.Config(path='./agent_0.cfg')
- self._uid = 'agent.0'
- self._cfg = cfg
+ self._uid = cfg.uid
self._pid = cfg.pid
self._sid = cfg.sid
+ self._owner = cfg.owner
self._pmgr = cfg.pmgr
self._pwd = cfg.pilot_sandbox
- self._session = session
- self._log = ru.Logger(self._uid, ns='radical.pilot')
+ self._session = Session(uid=cfg.sid, cfg=cfg, _role=Session._AGENT_0)
+
+ # init the worker / component base classes, connects registry
+ rpu.Worker.__init__(self, cfg, self._session)
+
self._starttime = time.time()
self._final_cause = None
+ # keep some state about service startups
+ self._service_uids_launched = list()
+ self._service_uids_running = list()
+ self._services_setup = mt.Event()
+
# this is the earliest point to sync bootstrap and agent profiles
- self._prof = ru.Profiler(ns='radical.pilot', name=self._uid)
self._prof.prof('hostname', uid=cfg.pid, msg=ru.get_hostname())
- # run an inline registry service to share runtime config with other
- # agents and components
- reg_uid = 'radical.pilot.reg.%s' % self._uid
- self._reg_service = ru.zmq.Registry(uid=reg_uid)
- self._reg_service.start()
- self._reg_addr = self._reg_service.addr
-
- # let all components know where to look for the registry
- self._cfg['reg_addr'] = self._reg_addr
-
- # connect to MongoDB for state push/pull
- self._connect_db()
-
# configure ResourceManager before component startup, as components need
# ResourceManager information for function (scheduler, executor)
self._configure_rm()
@@ -79,85 +70,65 @@ def __init__(self, cfg: ru.Config, session: Session):
# ensure that app communication channels are visible to workload
self._configure_app_comm()
- # expose heartbeat channel to sub-agents, bridges and components,
- # and start those
- self._cmgr = rpu.ComponentManager(self._cfg)
- self._cfg.heartbeat = self._cmgr.cfg.heartbeat
-
- self._cmgr.start_bridges()
- self._cmgr.start_components()
-
- # service tasks uids, which were launched
- self._service_uids_launched = list()
- # service tasks uids, which were confirmed to be started
- self._service_uids_running = list()
- # set flag when all services are running
- self._services_setup = mt.Event()
-
# create the sub-agent configs and start the sub agents
self._write_sa_configs()
self._start_sub_agents() # TODO: move to cmgr?
- # at this point the session is up and connected, and it should have
- # brought up all communication bridges and components. We are
- # ready to rumble!
- rpu.Worker.__init__(self, self._cfg, session)
+ # regularly check for lifetime limit
+ self.register_timed_cb(self._check_lifetime, timer=10)
- self.register_subscriber(rpc.CONTROL_PUBSUB, self._check_control)
- self.register_subscriber(rpc.STATE_PUBSUB, self._service_state_cb)
- # run our own slow-paced heartbeat monitor to watch pmgr heartbeats
- # FIXME: we need to get pmgr freq
- freq = 60
- tint = freq / 3
- tout = freq * 10
- self._hb = ru.Heartbeat(uid=self._uid,
- timeout=tout,
- interval=tint,
- beat_cb=self._hb_check, # no own heartbeat(pmgr pulls)
- term_cb=self._hb_term_cb,
- log=self._log)
- self._hb.start()
+ # --------------------------------------------------------------------------
+ #
+ def _proxy_input_cb(self, msg):
- # register pmgr heartbeat
- self._log.info('hb init for %s', self._pmgr)
- self._hb.beat(uid=self._pmgr)
+ self._log.debug_8('proxy input cb: %s', len(msg))
+ to_advance = list()
- # --------------------------------------------------------------------------
- #
- def _hb_check(self):
+ for task in msg:
+
+ # make sure the tasks obtain env settings (if needed)
+ if 'task_environment' in self.session.rcfg:
+
+ if not task['description'].get('environment'):
+ task['description']['environment'] = dict()
- self._log.debug('hb check')
+ for k,v in self.session.rcfg.task_environment.items():
+ # FIXME: this might overwrite user specified env
+ task['description']['environment'][k] = v
+
+ # FIXME: raise or fail task!
+ if task['state'] != rps.AGENT_STAGING_INPUT_PENDING:
+ self._log.error('invalid state: %s:%s:%s', task['uid'],
+ task['state'], task.get('states'))
+ continue
+
+ to_advance.append(task)
+
+ # now we really own the tasks and can start working on them (ie. push
+ # them into the pipeline). We don't publish nor profile as advance,
+ # since the state transition happened already on the client side when
+ # the state was set.
+ self.advance(to_advance, publish=False, push=True)
# --------------------------------------------------------------------------
#
- def _hb_term_cb(self, msg=None):
-
- self._cmgr.close()
- self._log.warn('hb termination: %s', msg)
+ def _proxy_output_cb(self, msg):
- return None
+ # we just forward the tasks to the task proxy queue
+ self._log.debug('proxy output cb: %s', len(msg))
+ self.advance(msg, publish=False, push=True, qname=self._sid)
# --------------------------------------------------------------------------
#
- def _connect_db(self):
-
- # Check for the RADICAL_PILOT_DB_HOSTPORT env var, which will hold
- # the address of the tunnelized DB endpoint. If it exists, we
- # overrule the agent config with it.
- hostport = os.environ.get('RADICAL_PILOT_DB_HOSTPORT')
- if hostport:
- host, port = hostport.split(':', 1)
- dburl = ru.Url(self._cfg.dburl)
- dburl.host = host
- dburl.port = port
- self._cfg.dburl = str(dburl)
-
- self._dbs = DBSession(sid=self._cfg.sid, dburl=self._cfg.dburl,
- cfg=self._cfg, log=self._log)
+ def _client_ctrl_cb(self, topic, msg):
+
+ self._log.debug('ctl sub cb: %s %s', topic, msg)
+ ## FIXME?
+
# --------------------------------------------------------------------------
#
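Note the FIXME in `_proxy_input_cb` above: resource-level `task_environment` settings can overwrite environment variables the user set on the task. A hedged sketch of a merge that would preserve user values (`merge_task_env` is a hypothetical helper, not current RP code):

```python
def merge_task_env(task, resource_env):
    '''
    Apply resource level environment settings to a task without
    clobbering values the user specified explicitly.
    '''
    env = task['description'].setdefault('environment', {})
    for k, v in resource_env.items():
        env.setdefault(k, v)   # user-specified values win
    return task
```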
@@ -167,9 +138,11 @@ def _configure_rm(self):
# use for sub-agent startup. Add the remaining ResourceManager
# information to the config, for the benefit of the scheduler).
- self._rm = ResourceManager.create(name=self._cfg.resource_manager,
- cfg=self._cfg, log=self._log,
- prof=self._prof)
+ rname = self.session.rcfg.resource_manager
+ self._rm = ResourceManager.create(name=rname,
+ cfg=self.session.cfg,
+ rcfg=self.session.rcfg,
+ log=self._log, prof=self._prof)
self._log.debug(pprint.pformat(self._rm.info))
@@ -182,40 +155,56 @@ def _configure_app_comm(self):
# channels, merge those into the agent config
#
# FIXME: this needs to start the app_comm bridges
- app_comm = self._cfg.get('app_comm')
+ app_comm = self.session.rcfg.get('app_comm')
if app_comm:
+
+ # bridge addresses also need to be exposed to the workload
+ if 'task_environment' not in self.session.rcfg:
+ self.session.rcfg['task_environment'] = dict()
+
if isinstance(app_comm, list):
app_comm = {ac: {'bulk_size': 0,
'stall_hwm': 1,
'log_level': 'error'} for ac in app_comm}
for ac in app_comm:
- if ac in self._cfg['bridges']:
+
+ if ac in self._reg['bridges']:
raise ValueError('reserved app_comm name %s' % ac)
- self._cfg['bridges'][ac] = app_comm[ac]
+ self._reg['bridges.%s' % ac] = app_comm[ac]
- # some of the bridge addresses also need to be exposed to the workload
- if app_comm:
- if 'task_environment' not in self._cfg:
- self._cfg['task_environment'] = dict()
- for ac in app_comm:
- if ac not in self._cfg['bridges']:
- raise RuntimeError('missing app_comm %s' % ac)
- self._cfg['task_environment']['RP_%s_IN' % ac.upper()] = \
- self._cfg['bridges'][ac]['addr_in']
- self._cfg['task_environment']['RP_%s_OUT' % ac.upper()] = \
- self._cfg['bridges'][ac]['addr_out']
+                AC  = ac.upper()
+                env = self.session.rcfg.task_environment
+
+                # 'ac' is the bridge *name*: the addresses live in the
+                # registry entry written above, not in the name string
+                env['RP_%s_IN'  % AC] = self._reg['bridges.%s.addr_in'  % ac]
+                env['RP_%s_OUT' % AC] = self._reg['bridges.%s.addr_out' % ac]
# --------------------------------------------------------------------------
#
def initialize(self):
- # registers the staging_input_queue as this is what we want to push
- # tasks to
+ # listen for new tasks from the client
+ self.register_input(rps.AGENT_STAGING_INPUT_PENDING,
+ rpc.PROXY_TASK_QUEUE,
+ qname=self._pid,
+ cb=self._proxy_input_cb)
+
+ # and forward to agent input staging
self.register_output(rps.AGENT_STAGING_INPUT_PENDING,
rpc.AGENT_STAGING_INPUT_QUEUE)
+ # listen for completed tasks to forward to client
+ self.register_input(rps.TMGR_STAGING_OUTPUT_PENDING,
+ rpc.AGENT_COLLECTING_QUEUE,
+ cb=self._proxy_output_cb)
+
+ # and register output
+ self.register_output(rps.TMGR_STAGING_OUTPUT_PENDING,
+ rpc.PROXY_TASK_QUEUE)
+
+ self.register_rpc_handler('prepare_env', self._prepare_env,
+ addr=self._pid)
+
# before we run any tasks, prepare a named_env `rp` for tasks which use
# the pilot's own environment, such as raptors
env_spec = {'type' : os.environ['RP_VENV_TYPE'],
@@ -225,29 +214,28 @@ def initialize(self):
'export PATH=%s'
% os.environ.get('PATH', '')]
}
- self._prepare_env('rp', env_spec)
-
- # register the command callback which pulls the DB for commands
- self.register_timed_cb(self._agent_control_cb,
- timer=self._cfg['db_poll_sleeptime'])
-
- # register idle callback to pull for tasks
- self.register_timed_cb(self._check_tasks_cb,
- timer=self._cfg['db_poll_sleeptime'])
+ self.rpc('prepare_env', env_name='rp', env_spec=env_spec,
+ addr=self._pid)
+ # start any services if they are requested
self._start_services()
# sub-agents are started, components are started, bridges are up: we are
- # ready to roll! Update pilot state.
- pilot = {'type' : 'pilot',
- 'uid' : self._pid,
- 'state' : rps.PMGR_ACTIVE,
- 'resource_details' : {
- # 'lm_info' : self._rm.lm_info.get('version_info'),
- # 'lm_detail' : self._rm.lm_info.get('lm_detail'),
- 'rm_info' : self._rm.info},
- '$set' : ['resource_details']}
- self.advance(pilot, publish=True, push=False)
+ # ready to roll! Send state update
+ rm_info = self._rm.info
+ n_nodes = len(rm_info['node_list'])
+
+ self._log.debug('advance to PMGR_ACTIVE')
+
+ pilot = {'$all' : True, # pass full info to client side
+ 'type' : 'pilot',
+ 'uid' : self._pid,
+ 'state' : rps.PMGR_ACTIVE,
+ 'resources': {'rm_info': rm_info,
+ 'cpu' : rm_info['cores_per_node'] * n_nodes,
+ 'gpu' : rm_info['gpus_per_node'] * n_nodes}}
+
+ self.advance(pilot, publish=True, push=False, fwd=True)
# --------------------------------------------------------------------------
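The `initialize()` hunk above replaces DB polling with direct channel wiring: tasks arrive from the client via `PROXY_TASK_QUEUE`, results flow back over the same proxy, and environment preparation becomes an RPC against a registered handler. A toy in-process mock of that register/dispatch pattern (`MiniWorker` is illustrative only, not the `rpu.Worker` API):

```python
class MiniWorker:

    def __init__(self):
        self._rpc_handlers = dict()

    def register_rpc_handler(self, name, handler):
        self._rpc_handlers[name] = handler

    def rpc(self, name, **kwargs):
        # RP routes this over its control channels; here we call in-process
        return self._rpc_handlers[name](**kwargs)


w = MiniWorker()
w.register_rpc_handler('prepare_env',
                       lambda env_name, env_spec: 'created %s' % env_name)
print(w.rpc('prepare_env', env_name='rp', env_spec={'type': 'venv'}))
```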
@@ -283,14 +271,16 @@ def finalize(self):
self._log.debug('stage output parent')
self.stage_output()
- # tear things down in reverse order
- self._hb.stop()
- self._cmgr.close()
+ self._log.info('rusage: %s', rpu.get_rusage())
- if self._rm:
- self._rm.stop()
+ out, err, log = '', '', ''
- self._reg_service.stop()
+ try : out = open('./agent_0.out', 'r').read(1024)
+ except: pass
+ try : err = open('./agent_0.err', 'r').read(1024)
+ except: pass
+ try : log = open('./agent_0.log', 'r').read(1024)
+ except: pass
if self._final_cause == 'timeout' : state = rps.DONE
elif self._final_cause == 'cancel' : state = rps.CANCELED
@@ -302,29 +292,20 @@ def finalize(self):
with ru.ru_open('./killme.signal', 'w') as fout:
fout.write('%s\n' % state)
- # we don't rely on the existence / viability of the update worker at
- # that point.
- self._log.debug('update db state: %s: %s', state, self._final_cause)
- self._log.info('rusage: %s', rpu.get_rusage())
+ pilot = {'type' : 'pilot',
+ 'uid' : self._pid,
+ 'stdout' : out,
+ 'stderr' : err,
+ 'logfile': log,
+ 'state' : state}
- out, err, log = '', '', ''
-
- try : out = ru.ru_open('./agent.0.out', 'r').read(1024)
- except: pass
- try : err = ru.ru_open('./agent.0.err', 'r').read(1024)
- except: pass
- try : log = ru.ru_open('./agent.0.log', 'r').read(1024)
- except: pass
+ self._log.debug('push final state update')
+ self._log.debug('update state: %s: %s', state, self._final_cause)
+ self.advance(pilot, publish=True, push=False)
- ret = self._dbs._c.update({'type' : 'pilot',
- 'uid' : self._pid},
- {'$set' : {'stdout' : rpu.tail(out),
- 'stderr' : rpu.tail(err),
- 'logfile': rpu.tail(log),
- 'state' : state},
- '$push': {'states' : state}
- })
- self._log.debug('update ret: %s', ret)
+ # tear things down in reverse order
+ self._rm.stop()
+ self._session.close()
# --------------------------------------------------------------------
@@ -334,41 +315,40 @@ def _write_sa_configs(self):
# we have all information needed by the subagents -- write the
# sub-agent config files.
- # write deep-copies of the config for each sub-agent (sans from agent.0)
- for sa in self._cfg.get('agents', {}):
+ # write deep-copies of the config for each sub-agent (sans from agent_0)
+ for sa in self.session.cfg.get('agents', {}):
- assert (sa != 'agent.0'), 'expect subagent, not agent.0'
+ assert (sa != 'agent_0'), 'expect subagent, not agent_0'
# use our own config sans agents/components/bridges as a basis for
# the sub-agent config.
- tmp_cfg = copy.deepcopy(self._cfg)
+ tmp_cfg = copy.deepcopy(self.session.cfg)
tmp_cfg['agents'] = dict()
tmp_cfg['components'] = dict()
tmp_cfg['bridges'] = dict()
# merge sub_agent layout into the config
- ru.dict_merge(tmp_cfg, self._cfg['agents'][sa], ru.OVERWRITE)
+ ru.dict_merge(tmp_cfg, self.session.cfg['agents'][sa], ru.OVERWRITE)
tmp_cfg['uid'] = sa
tmp_cfg['aid'] = sa
- tmp_cfg['owner'] = 'agent.0'
-
- ru.write_json(tmp_cfg, './%s.cfg' % sa)
+ tmp_cfg['owner'] = 'agent_0'
# --------------------------------------------------------------------------
#
def _start_services(self):
- service_descriptions = self._cfg.services
- if not service_descriptions:
+ sds = self._cfg.services
+ if not sds:
return
+
self._log.info('starting agent services')
services = list()
- for service_desc in service_descriptions:
+ for sd in sds:
- td = TaskDescription(service_desc)
+ td = TaskDescription(sd)
td.mode = AGENT_SERVICE
# ensure that the description is viable
td.verify()
@@ -376,7 +356,6 @@ def _start_services(self):
cfg = self._cfg
tid = ru.generate_id('service.%(item_counter)04d',
ru.ID_CUSTOM, ns=self._cfg.sid)
-
task = dict()
task['origin'] = 'agent'
task['description'] = td.as_dict()
@@ -396,6 +375,9 @@ def _start_services(self):
self._service_uids_launched.append(tid)
services.append(task)
+ self._log.debug('start service %s: %s', tid, sd)
+
+
self.advance(services, publish=False, push=True)
# Waiting 2mins for all services to launch
@@ -405,38 +387,6 @@ def _start_services(self):
self._log.info('all agent services started')
- # --------------------------------------------------------------------------
- #
- def _service_state_cb(self, topic, msg): # pylint: disable=unused-argument
-
- cmd = msg['cmd']
- tasks = msg['arg']
-
- if cmd != 'update':
- return
-
- for service in ru.as_list(tasks):
-
- if service['uid'] not in self._service_uids_launched or \
- service['uid'] in self._service_uids_running:
- continue
-
- self._log.debug('service state update %s: %s',
- service['uid'], service['state'])
-
- if service['state'] != rps.AGENT_EXECUTING:
- continue
-
- self._service_uids_running.append(service['uid'])
- self._log.debug('service %s started (%s / %s)', service['uid'],
- len(self._service_uids_running),
- len(self._service_uids_launched))
-
- if len(self._service_uids_launched) == \
- len(self._service_uids_running):
- self._services_setup.set()
-
-
# --------------------------------------------------------------------------
#
def _start_sub_agents(self):
@@ -448,10 +398,14 @@ def _start_sub_agents(self):
# FIXME: reroute to agent daemonizer
- if not self._cfg.get('agents'):
+ if not self.session.cfg.get('agents'):
return
- assert (len(self._rm.info.agent_node_list) >= len(self._cfg['agents']))
+ n_agents = len(self.session.cfg['agents'])
+ n_agent_nodes = len(self._rm.info.agent_node_list)
+
+ assert n_agent_nodes >= n_agents
+
self._log.debug('start_sub_agents')
@@ -460,13 +414,14 @@ def _start_sub_agents(self):
# the configs are written, and the sub-agents can be started. To know
# how to do that we create the agent launch method, have it creating
- # the respective command lines per agent instance, and run via
- # popen.
- #
+ # the respective command lines per agent instance, and run via popen.
- for idx, sa in enumerate(self._cfg['agents']):
+ bs_name = '%s/bootstrap_2.sh'
- target = self._cfg['agents'][sa]['target']
+ for idx, sa in enumerate(self.session.cfg['agents']):
+
+ target = self.session.cfg['agents'][sa]['target']
+ bs_args = [self._sid, self.session.cfg.reg_addr, sa]
if target not in ['local', 'node']:
@@ -475,8 +430,8 @@ def _start_sub_agents(self):
if target == 'local':
# start agent locally
- cmdline = '/bin/sh -l %s/bootstrap_2.sh %s' % (self._pwd, sa)
-
+ bs_path = bs_name % self._pwd
+ cmdline = '/bin/sh -l %s' % ' '.join([bs_path] + bs_args)
else: # target == 'node':
@@ -491,7 +446,7 @@ def _start_sub_agents(self):
# out for the moment, which will make this unable to
# work with a number of launch methods. Can the
# offset computation be moved to the ResourceManager?
- bs_name = '%s/bootstrap_2.sh' % (self._pwd)
+
launch_script = '%s/%s.launch.sh' % (self._pwd, sa)
exec_script = '%s/%s.exec.sh' % (self._pwd, sa)
@@ -503,7 +458,7 @@ def _start_sub_agents(self):
'ranks' : 1,
'cores_per_rank': self._rm.info.cores_per_node,
'executable' : '/bin/sh',
- 'arguments' : [bs_name, sa]
+ 'arguments' : [bs_name % self._pwd] + bs_args
}).as_dict(),
'slots': {'ranks' : [{'node_name': node['node_name'],
'node_id' : node['node_id'],
@@ -535,7 +490,7 @@ def _start_sub_agents(self):
tmp = '#!/bin/sh\n\n'
tmp += '. ./env/agent.env\n'
- tmp += '/bin/sh -l ./bootstrap_2.sh %s\n\n' % sa
+ tmp += '/bin/sh -l %s\n\n' % ' '.join([bs_name % '.'] + bs_args)
with ru.ru_open(exec_script, 'w') as fout:
fout.write(tmp)
@@ -560,254 +515,99 @@ def _start_sub_agents(self):
# --------------------------------------------------------------------------
#
- def _agent_control_cb(self):
-
- if not self._check_commands(): return False
- if not self._check_rpc (): return False
- if not self._check_state (): return False
-
- return True
-
-
- # --------------------------------------------------------------------------
- #
- def _check_commands(self):
-
- # Check if there's a command waiting
- # FIXME: this pull should be done by the update worker, and commands
- # should then be communicated over the command pubsub
- # FIXME: commands go to pmgr, tmgr, session docs
- # FIXME: check if pull/wipe are atomic
- # FIXME: long runnign commands can time out on hb
- retdoc = self._dbs._c.find_and_modify(
- query ={'uid' : self._pid},
- fields=['cmds'], # get new commands
- update={'$set': {'cmds': list()}}) # wipe old commands
-
- if not retdoc:
- return True
+ def _check_lifetime(self):
- for spec in retdoc.get('cmds', []):
-
- cmd = spec['cmd']
- arg = spec['arg']
-
- self._log.debug('pilot command: %s: %s', cmd, arg)
- self._prof.prof('cmd', msg="%s : %s" % (cmd, arg), uid=self._pid)
+ # Make sure that we haven't exceeded the runtime - otherwise terminate.
+ if self.session.cfg.runtime:
- if cmd == 'heartbeat' and arg['pmgr'] == self._pmgr:
- self._hb.beat(uid=self._pmgr)
+ if time.time() >= self._starttime + \
+ (int(self.session.cfg.runtime) * 60):
- elif cmd == 'cancel_pilot':
- self._log.info('cancel_pilot cmd')
- self.publish(rpc.CONTROL_PUBSUB, {'cmd' : 'terminate',
- 'arg' : None})
- self._final_cause = 'cancel'
+ self._log.info('runtime limit (%ss).',
+ self.session.cfg.runtime * 60)
+ self._final_cause = 'timeout'
+ self.stop()
return False # we are done
- elif cmd == 'cancel_tasks':
- self._log.info('cancel_tasks cmd')
- self.publish(rpc.CONTROL_PUBSUB, {'cmd' : 'cancel_tasks',
- 'arg' : arg})
- else:
- self._log.warn('could not interpret cmd "%s" - ignore', cmd)
-
return True
# --------------------------------------------------------------------------
#
- def _check_rpc(self):
+ def control_cb(self, topic, msg):
'''
- check if the DB has any RPC request for this pilot. If so, then forward
- that request as `rpc_req` command on the CONTROL channel, and listen for
- an `rpc_res` command on the same channel, for the same rpc id. Once
- that response is received (from whatever component handled that
- command), send the response back to the databse for the callee to pick
- up.
+ Check for commands on the control pubsub, mainly waiting for RPC
+ requests to handle.
'''
- # FIXME: implement a timeout, and/or a registry of rpc clients
+ self._log.debug_1('control msg %s: %s', topic, msg)
- retdoc = self._dbs._c.find_and_modify(
- query ={'uid' : self._pid},
- fields=['rpc_req'],
- update={'$set': {'rpc_req': None}})
+ cmd = msg['cmd']
+ arg = msg.get('arg')
- if not retdoc:
- # no rpc request found
- return True
+ self._log.debug('pilot command: %s: %s', cmd, arg)
+ self._prof.prof('cmd', msg="%s : %s" % (cmd, arg), uid=self._pid)
- rpc_req = retdoc.get('rpc_req')
- if rpc_req is None:
- # document has no rpc request
+ if cmd == 'pmgr_heartbeat' and arg['pmgr'] == self._pmgr:
+ self._session._hb.beat(uid=self._pmgr)
return True
- self._log.debug('rpc req: %s', rpc_req)
-
- # RPCs are synchronous right now - we send the RPC on the command
- # channel, hope that some component picks it up and replies, and then
- # return that reply. The reply is received via a temporary callback
- # defined here, which will receive all CONTROL messages until the right
- # rpc response comes along.
- def rpc_cb(topic, msg):
-
- rpc_id = rpc_req['uid']
-
- cmd = msg['cmd']
- rpc_res = msg['arg']
-
- if cmd != 'rpc_res':
-                # not an rpc response, keep cb registered
- return True
-
- if rpc_res['uid'] != rpc_id:
- # not the right rpc response, keep cb registered
- return True
+ elif cmd == 'cancel_pilots':
+ return self._ctrl_cancel_pilots(msg)
- # send the response to the DB
- self._dbs._c.update({'type' : 'pilot',
- 'uid' : self._pid},
- {'$set' : {'rpc_res': rpc_res}})
-
- # work is done - unregister this temporary cb
- return False
-
-
- self.register_subscriber(rpc.CONTROL_PUBSUB, rpc_cb)
-
- # ready to receive and proxy rpc response -- forward rpc request on
- # control channel
- self.publish(rpc.CONTROL_PUBSUB, {'cmd' : 'rpc_req',
- 'arg' : rpc_req})
-
-        return True # keep cb registered (self._check_rpc)
+ elif cmd == 'service_up':
+ return self._ctrl_service_up(msg)
# --------------------------------------------------------------------------
#
- def _check_control(self, _, msg):
- '''
- Check for commands on the control pubsub, mainly waiting for RPC
- requests to handle. We handle two types of RPC requests: `hello` for
- testing, and `prepare_env` for environment preparation requests.
- '''
+ def _ctrl_cancel_pilots(self, msg):
- cmd = msg['cmd']
arg = msg['arg']
- if cmd != 'rpc_req':
- # not an rpc request
- return True
+ if self._pid not in arg.get('uids'):
+            self._log.debug('ignore cancel %s', msg)
+            return True
- req = arg['rpc']
- if req not in ['hello', 'prepare_env']:
- # we don't handle that request
- return True
+ self._log.info('cancel pilot cmd')
+ self.publish(rpc.CONTROL_PUBSUB, {'cmd' : 'terminate',
+ 'arg' : None})
+ self._final_cause = 'cancel'
+ self.stop()
- rpc_res = {'uid': arg['uid']}
- try:
- if req == 'hello' :
- out = 'hello %s' % ' '.join(arg['arg'])
-
- elif req == 'prepare_env':
- env_name = arg['arg']['env_name']
- env_spec = arg['arg']['env_spec']
- out = self._prepare_env(env_name, env_spec)
-
- else:
- # unknown command
- return True
-
- # request succeeded - respond with return value
- rpc_res['err'] = None
- rpc_res['out'] = out
- rpc_res['ret'] = 0
-
- except Exception as e:
- # request failed for some reason - indicate error
- rpc_res['err'] = repr(e)
- rpc_res['out'] = None
- rpc_res['ret'] = 1
- self._log.exception('control cmd failed')
-
- # publish the response (success or failure)
- self.publish(rpc.CONTROL_PUBSUB, {'cmd': 'rpc_res',
- 'arg': rpc_res})
- return True
+ # work is done - unregister this cb
+ return False
# --------------------------------------------------------------------------
#
- def _check_state(self):
+ def _ctrl_service_up(self, msg):
- # Make sure that we haven't exceeded the runtime - otherwise terminate.
- if self._cfg.runtime:
+ uid = msg['arg']['uid']
- if time.time() >= self._starttime + (int(self._cfg.runtime) * 60):
-
- self._log.info('runtime limit (%ss).', self._cfg.runtime * 60)
- self._final_cause = 'timeout'
- self.stop()
- return False # we are done
-
- return True
+ # This message signals that an agent service instance is up and running.
+ # We expect to find the service UID in args and can then unblock the
+ # service startup wait for that uid
-
- # --------------------------------------------------------------------------
- #
- def _check_tasks_cb(self):
-
- # Check for tasks waiting for input staging and log pull.
- #
- # FIXME: Unfortunately, 'find_and_modify' is not bulkable, so we have
- # to use 'find'. To avoid finding the same tasks over and over
- # again, we update the 'control' field *before* running the next
- # find -- so we do it right here.
- # This also blocks us from using multiple ingest threads, or from
- # doing late binding by task pull :/
- task_cursor = self._dbs._c.find({'type' : 'task',
- 'pilot' : self._pid,
- 'control' : 'agent_pending'})
- if not task_cursor.count():
- self._log.info('tasks pulled: 0')
+ if uid not in self._service_uids_launched:
+ # we do not know this service instance
+ self._log.warn('ignore service startup signal for %s', uid)
return True
- # update the tasks to avoid pulling them again next time.
- task_list = list(task_cursor)
- task_uids = [task['uid'] for task in task_list]
-
- self._dbs._c.update({'type' : 'task',
- 'uid' : {'$in' : task_uids}},
- {'$set' : {'control' : 'agent'}},
- multi=True)
-
- self._log.info("tasks pulled: %4d", len(task_list))
- self._prof.prof('get', msg='bulk: %d' % len(task_list), uid=self._pid)
-
- for task in task_list:
-
- # make sure the tasks obtain env settings (if needed)
- if 'task_environment' in self._cfg:
- if not task['description'].get('environment'):
- task['description']['environment'] = dict()
- for k,v in self._cfg['task_environment'].items():
- task['description']['environment'][k] = v
-
- # we need to make sure to have the correct state:
- task['state'] = rps._task_state_collapse(task['states'])
- self._prof.prof('get', uid=task['uid'])
+ if uid in self._service_uids_running:
+ self._log.warn('duplicated service startup signal for %s', uid)
+ return True
- # FIXME: raise or fail task!
- if task['state'] != rps.AGENT_STAGING_INPUT_PENDING:
- self._log.error('invalid state: %s', (pprint.pformat(task)))
+ self._log.debug('service startup message for %s', uid)
- task['control'] = 'agent'
+ self._service_uids_running.append(uid)
+ self._log.debug('service %s started (%s / %s)', uid,
+ len(self._service_uids_running),
+ len(self._service_uids_launched))
- # now we really own the CUs, and can start working on them (ie. push
- # them into the pipeline). We don't publish nor profile as advance,
- # since that happened already on the module side when the state was set.
- self.advance(task_list, publish=False, push=True)
+ # signal main thread when all services are up
+ if len(self._service_uids_launched) == \
+ len(self._service_uids_running):
+ self._services_setup.set()
return True
@@ -816,7 +616,7 @@ def _check_tasks_cb(self):
#
def _prepare_env(self, env_name, env_spec):
- self._log.debug('env_spec: %s', env_spec)
+ self._log.debug('env_spec %s: %s', env_name, env_spec)
etype = env_spec.get('type', 'venv')
evers = env_spec.get('version')
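For reference while reading `_prepare_env`: the `env_spec` dicts used throughout this diff carry a `type`, an optional `version`, and an optional `setup` list; treat any other key as an assumption. A sketch:

```python
env_spec = {'type'   : 'venv',      # or 'virtualenv', as in the examples
            'version': '3.9',       # optional: interpreter version
            'setup'  : ['numpy']}   # optional: packages to install

# agent-side, provisioning now goes through the RPC registered above:
#   self.rpc('prepare_env', env_name='rp', env_spec=env_spec, addr=self._pid)
```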
diff --git a/src/radical/pilot/agent/agent_n.py b/src/radical/pilot/agent/agent_n.py
index 4de6b2c04d..ba05f1c229 100644
--- a/src/radical/pilot/agent/agent_n.py
+++ b/src/radical/pilot/agent/agent_n.py
@@ -5,7 +5,9 @@
import time
import radical.utils as ru
-from .. import utils as rpu
+from .. import utils as rpu
+
+from .. import Session
# ------------------------------------------------------------------------------
@@ -18,19 +20,21 @@ class Agent_n(rpu.Worker):
# --------------------------------------------------------------------------
#
- def __init__(self, cfg, session):
+ def __init__(self, cfg: ru.Config, session):
self._cfg = cfg
self._pid = cfg.pid
self._pmgr = cfg.pmgr
self._pwd = cfg.pilot_sandbox
self._sid = cfg.sid
self._reg_addr = cfg.reg_addr
+ self._session = session
+
# log / profile via session until component manager is initialized
- self._session = session
- self._log = session._log
- self._prof = session._prof
+ self._log = self._session._log
+ self._prof = self._session._prof
self._starttime = time.time()
self._final_cause = None
@@ -39,18 +43,10 @@ def __init__(self, cfg, session):
self._prof.prof('hostname', uid=self._pid, msg=ru.get_hostname())
self._prof.prof('sub_agent_start', uid=self._pid)
- # expose heartbeat channel to sub-agents, bridges and components,
- # and start those
- self._cmgr = rpu.ComponentManager(self._cfg)
- self._cfg.heartbeat = self._cmgr.cfg.heartbeat
-
- self._cmgr.start_bridges()
- self._cmgr.start_components()
-
# at this point the session is up and connected, and it should have
# brought up all communication bridges and components. We are
# ready to rumble!
- rpu.Worker.__init__(self, self._cfg, session)
+ rpu.Worker.__init__(self, self._cfg, self._session)
# --------------------------------------------------------------------------
diff --git a/src/radical/pilot/agent/bootstrap_0.sh b/src/radical/pilot/agent/bootstrap_0.sh
index 7875eabe04..c93adb1cfb 100755
--- a/src/radical/pilot/agent/bootstrap_0.sh
+++ b/src/radical/pilot/agent/bootstrap_0.sh
@@ -90,7 +90,7 @@ SDISTS=
RUNTIME=
VIRTENV=
VIRTENV_MODE=
-CCM=
+LAUNCHER=
PILOT_ID=
RP_VERSION=
PYTHON=
@@ -220,7 +220,7 @@ create_gtod()
| cut -f1 -d' ')
printf "%.6f,%s,%s,%s,%s,%s,%s\n" \
"$now" "sync_abs" "bootstrap_0" "MainThread" "$PILOT_ID" \
- "PMGR_ACTIVE_PENDING" "$(hostname):$ip:$now:$now:$now" \
+ "$pilot_state" "$(hostname):$ip:$now:$now:$now" \
| tee -a "$PROFILE"
}
@@ -268,7 +268,7 @@ profile_event()
# MSG = 6 # message describing the event optional
# ENTITY = 7 # type of entity involved optional
printf "%.6f,%s,%s,%s,%s,%s,%s\n" \
- "$now" "$event" "bootstrap_0" "MainThread" "$PILOT_ID" "pilot_state" "$msg" \
+ "$now" "$event" "bootstrap_0" "MainThread" "$PILOT_ID" "$pilot_state" "$msg" \
>> "$PROFILE"
}
@@ -1448,16 +1448,6 @@ $cmd"
}
-# -------------------------------------------------------------------------------
-#
-# Build the PREBOOTSTRAP2 variable to pass down to sub-agents
-#
-add_services()
-{
- echo "$* &" >> ./services
-}
-
-
# -------------------------------------------------------------------------------
#
# untar the pilot sandbox
@@ -1500,17 +1490,16 @@ untar()
#
# NOTE: -z makes some assumptions on sandbox and tarball location
#
-while getopts "a:b:cd:e:f:h:i:j:m:p:r:s:t:v:w:x:y:z:" OPTION; do
+while getopts "a:b:cd:e:f:h:i:m:p:r:s:t:v:w:x:y:z:" OPTION; do
case $OPTION in
a) SESSION_SANDBOX="$OPTARG" ;;
b) PYTHON_DIST="$OPTARG" ;;
- c) CCM='TRUE' ;;
+ c) LAUNCHER='ccmrun' ;;
d) SDISTS="$OPTARG" ;;
e) pre_bootstrap_0 "$OPTARG" ;;
f) FORWARD_TUNNEL_ENDPOINT="$OPTARG" ;;
h) HOSTPORT="$OPTARG" ;;
i) PYTHON="$OPTARG" ;;
- j) add_services "$OPTARG" ;;
m) VIRTENV_MODE="$OPTARG" ;;
p) PILOT_ID="$OPTARG" ;;
r) RP_VERSION="$OPTARG" ;;
@@ -1576,19 +1565,7 @@ echo "# -------------------------------------------------------------------"
touch "$LOGFILES_TARBALL"
touch "$PROFILES_TARBALL"
-
-# At this point, all pre_bootstrap_0 commands have been executed. We copy the
-# resulting PATH and LD_LIBRARY_PATH, and apply that in bootstrap_2.sh, so that
-# the sub-agents start off with the same env (or at least the relevant parts of
-# it).
-#
-# This assumes that the env is actually transferable. If that assumption
-# breaks at some point, we'll have to either only transfer the incremental env
-# changes, or reconsider the approach to pre_bootstrap_x commands altogether --
-# see comment in the pre_bootstrap_0 function.
-PB1_PATH="$PATH"
-PB1_LDLB="$LD_LIBRARY_PATH"
-
+pilot_state="PMGR_ACTIVE_PENDING"
# FIXME: By now the pre_process rules are already performed.
# We should split the parsing and the execution of those.
# "bootstrap start" is here so that $PILOT_ID is known.
@@ -1596,7 +1573,6 @@ PB1_LDLB="$LD_LIBRARY_PATH"
echo 'create gtod, prof'
create_gtod
create_prof
-pilot_state="PMGR_ACTIVE_PENDING"
profile_event 'bootstrap_0_start'
# NOTE: if the virtenv path contains a symbolic link element, then distutil will
@@ -1739,24 +1715,6 @@ create_deactivate
# ------------------------------------------------------------------------------
# launch the radical agent
#
-# the actual agent script lives in PWD if it was staged -- otherwise we use it
-# from the virtenv
-# NOTE: For some reasons, I have seen installations where 'scripts' go into
-# bin/, and some where setuptools only changes them in place. For now,
-# we allow for both -- but eventually (once the agent itself is small),
-# we may want to move it to bin ourself. At that point, we probably
-# have re-implemented pip... :/
-# FIXME: the second option should use $RP_MOD_PATH, or should derive the path
-# from the imported rp modules __file__.
-PILOT_SCRIPT=`which radical-pilot-agent`
-
-if test -z "$PILOT_SCRIPT"
-then
- echo "ERROR: rp installation incomplete?"
- env_dump > env.rp.error
- exit 1
-fi
-
# after all is said and done, we should end up with a usable python version.
# Verify it
@@ -1766,15 +1724,12 @@ verify_install
# is independent of its location in the pilot VE
test -z $(which radical-gtod) || cp $(which radical-gtod) ./gtod
-AGENT_CMD="$PYTHON $PILOT_SCRIPT"
-
verify_rp_install
# TODO: (re)move this output?
echo
echo "# -------------------------------------------------------------------"
echo "# Launching radical-pilot-agent "
-echo "# CMDLINE: $AGENT_CMD"
# At this point we expand the variables in $PREBOOTSTRAP2 to pick up the
# changes made by the environment by pre_bootstrap_0.
@@ -1820,79 +1775,42 @@ else
BS_SHELL='/bin/sh'
fi
-cat > bootstrap_2.sh <<EOT
-#!$BS_SHELL
-if test "\$1" = "services"
-then
-    # start the services script
-    exec ./services 1>> services.out 2>> services.err
-else
-    # start a sub-agent
-    exec $AGENT_CMD "\$1" 1>>"\$1.out" 2>>"\$1.err"
-fi
+cat > bootstrap_2.sh <<EOT
+#!$BS_SHELL
+# start (sub) agent
+exec radical-pilot-agent_n "\$sid" "\$reg_addr" "\$uid" \\
+ 1>>"bootstrap_2.\$uid.out" \\
+ 2>>"bootstrap_2.\$uid.err"
EOT
chmod 0755 bootstrap_2.sh
# ------------------------------------------------------------------------------
-# add a `wait` to the services script
-test -f ./services && echo 'wait' >> ./services
-test -f ./services && chmod 0755 ./services
-
-# start the master agent instance (zero)
+# start the master agent instance (agent_0) in the bs0 environment
profile_event 'bootstrap_0_ok'
-if test -z "$CCM"; then
- ./bootstrap_2.sh 'agent.0' \
- 1>> agent.0.bootstrap_2.out \
- 2>> agent.0.bootstrap_2.err &
-else
- ccmrun ./bootstrap_2.sh 'agent.0' \
- 1>> agent.0.bootstrap_2.out \
- 2>> agent.0.bootstrap_2.err &
-fi
+
+$LAUNCHER radical-pilot-agent_0 1>>agent_0.out 2>>agent_0.err &
+
AGENT_PID=$!
pilot_state="PMGR_ACTIVE"
@@ -1901,7 +1819,6 @@ do
sleep 3
if kill -0 $AGENT_PID 2>/dev/null
then
- echo -n '.'
if test -e "./killme.signal"
then
profile_event 'killme' "`date --rfc-3339=ns | cut -c -23`"
@@ -1916,7 +1833,6 @@ do
kill -9 $AGENT_PID
fi
else
- echo
profile_event 'agent_gone' "`date --rfc-3339=ns | cut -c -23`"
echo "agent $AGENT_PID is gone"
break
@@ -1946,7 +1862,7 @@ echo "# CLEANUP: $CLEANUP"
echo "#"
profile_event 'cleanup_start'
-contains $CLEANUP 'l' && rm -r "$PILOT_SANDBOX/agent.*"
+contains $CLEANUP 'l' && rm -r "$PILOT_SANDBOX/agent_*"
contains $CLEANUP 'u' && rm -r "$PILOT_SANDBOX/task.*"
contains $CLEANUP 'v' && rm -r "$VIRTENV/" # FIXME: in what cases?
contains $CLEANUP 'e' && rm -r "$PILOT_SANDBOX/"
@@ -2037,13 +1953,6 @@ then
final_state='FAILED'
fi
-echo "# -------------------------------------------------------------------"
-echo "# push final pilot state: $SESSION_ID $PILOT_ID $final_state"
-sp=$(which radical-pilot-agent-statepush)
-test -z "$sp" && echo "statepush not found"
-test -z "$sp" || $PYTHON "$sp" agent.0.cfg "$final_state"
-
-echo
echo "# -------------------------------------------------------------------"
echo "#"
echo "# Done, exiting ($AGENT_EXITCODE)"
diff --git a/src/radical/pilot/agent/executing/base.py b/src/radical/pilot/agent/executing/base.py
index 24e1da8730..5764d61d9c 100644
--- a/src/radical/pilot/agent/executing/base.py
+++ b/src/radical/pilot/agent/executing/base.py
@@ -50,16 +50,16 @@ def create(cls, cfg, session):
if cls != AgentExecutingComponent:
raise TypeError('Factory only available to base class!')
- name = cfg['spawner']
+ name = session.rcfg.agent_spawner
from .popen import Popen
from .flux import Flux
from .sleep import Sleep
impl = {
- EXECUTING_NAME_POPEN : Popen,
- EXECUTING_NAME_FLUX : Flux,
- EXECUTING_NAME_SLEEP : Sleep,
+ EXECUTING_NAME_POPEN: Popen,
+ EXECUTING_NAME_FLUX : Flux,
+ EXECUTING_NAME_SLEEP: Sleep,
}
if name not in impl:
@@ -72,26 +72,25 @@ def create(cls, cfg, session):
#
def initialize(self):
- # self._log.debug('exec base initialize')
- # The spawner/executor needs the ResourceManager information which have
- # been collected during agent startup.
- self._rm = rpa.ResourceManager.create(self._cfg.resource_manager,
- self._cfg, self._log, self._prof)
+ rm_name = self.session.rcfg.resource_manager
+ self._rm = rpa.ResourceManager.create(rm_name,
+ self.session.cfg,
+ self.session.rcfg,
+ self._log, self._prof)
self._pwd = os.path.realpath(os.getcwd())
- self.sid = self._cfg['sid']
- self.resource = self._cfg['resource']
- self.rsbox = self._cfg['resource_sandbox']
- self.ssbox = self._cfg['session_sandbox']
- self.psbox = self._cfg['pilot_sandbox']
+ self.sid = self.session.uid
+ self.resource = self.session.cfg.resource
+ self.rsbox = self.session.cfg.resource_sandbox
+ self.ssbox = self.session.cfg.session_sandbox
+ self.psbox = self.session.cfg.pilot_sandbox
self.gtod = '$RP_PILOT_SANDBOX/gtod'
self.prof = '$RP_PILOT_SANDBOX/prof'
- # if so configured, let the Task know what to use as tmp dir
- self._task_tmp = self._cfg.get('task_tmp',
- os.environ.get('TMP', '/tmp'))
-
+ # if so configured, let the tasks know what to use as tmp dir
+ self._task_tmp = self.session.rcfg.get('task_tmp',
+ os.environ.get('TMP', '/tmp'))
if self.psbox.startswith(self.ssbox):
self.psbox = '$RP_SESSION_SANDBOX%s' % self.psbox[len(self.ssbox):]
@@ -107,7 +106,6 @@ def initialize(self):
rpc.AGENT_STAGING_OUTPUT_QUEUE)
self.register_publisher (rpc.AGENT_UNSCHEDULE_PUBSUB)
- self.register_subscriber(rpc.CONTROL_PUBSUB, self.control_cb)
self._to_tasks = list()
self._to_lock = mt.Lock()
@@ -132,14 +130,13 @@ def control_cb(self, topic, msg):
cmd = msg['cmd']
arg = msg['arg']
+ # FIXME RPC: already handled in the component base class
if cmd == 'cancel_tasks':
self._log.info('cancel_tasks command (%s)', arg)
for tid in arg['uids']:
self.cancel_task(tid)
- return True
-
# --------------------------------------------------------------------------
#
diff --git a/src/radical/pilot/agent/executing/flux.py b/src/radical/pilot/agent/executing/flux.py
index be7ba0c31b..56144edbc3 100644
--- a/src/radical/pilot/agent/executing/flux.py
+++ b/src/radical/pilot/agent/executing/flux.py
@@ -56,12 +56,11 @@ def initialize(self):
}
# we get an instance of the resource manager (init from registry info)
- self._rm = ResourceManager.create(name=self._cfg.resource_manager,
- cfg=self._cfg, log=self._log,
- prof=self._prof)
-
- # assert self._rm.from_info
-
+ rm_name = self.session.rcfg.resource_manager
+ self._rm = ResourceManager.create(rm_name,
+ self.session.cfg,
+ self.session.rcfg,
+ self._log, self._prof)
# thread termination signal
self._term = mt.Event()
@@ -120,10 +119,11 @@ def work(self, tasks):
#
def _listen(self):
- lm_cfg = self._cfg.resource_cfg.launch_methods.get('FLUX')
- lm_cfg['pid'] = self._cfg.pid
- lm_cfg['reg_addr'] = self._cfg.reg_addr
- lm = LaunchMethod.create('FLUX', lm_cfg, self._cfg,
+ lm_cfg = self.session.rcfg.launch_methods.get('FLUX')
+ lm_cfg['pid'] = self.session.cfg.pid
+ lm_cfg['reg_addr'] = self.session.cfg.reg_addr
+ lm = LaunchMethod.create('FLUX', lm_cfg,
+ self.session.cfg,
self._log, self._prof)
flux_handle = None
try:
diff --git a/src/radical/pilot/agent/executing/popen.py b/src/radical/pilot/agent/executing/popen.py
index ec7c5fce71..ce0e0fe9c8 100644
--- a/src/radical/pilot/agent/executing/popen.py
+++ b/src/radical/pilot/agent/executing/popen.py
@@ -19,8 +19,6 @@
from .base import AgentExecutingComponent
-from ...task_description import RAPTOR_MASTER, RAPTOR_WORKER
-
# ------------------------------------------------------------------------------
# ensure tasks are killed on termination
@@ -66,7 +64,7 @@ def initialize(self):
self._watch_queue = queue.Queue()
- self._pid = self._cfg['pid']
+ self._pid = self.session.cfg.pid
# run watcher thread
self._watcher = mt.Thread(target=self._watch)
@@ -227,9 +225,6 @@ def _handle_task(self, task):
ru.rec_makedir(sbox)
- if td['mode'] in [RAPTOR_MASTER, RAPTOR_WORKER]:
- ru.write_json('%s/%s.json' % (sbox, tid), td)
-
with ru.ru_open('%s/%s' % (sbox, launch_script), 'w') as fout:
tmp = ''
@@ -345,10 +340,10 @@ def _handle_task(self, task):
self._log.info('Launching task %s via %s in %s', tid, cmdline, sbox)
_launch_out_h = ru.ru_open('%s/%s.launch.out' % (sbox, tid), 'w')
+
# `start_new_session=True` is default, which enables decoupling
# from the parent process group (part of the task cancellation)
- _start_new_session = self._cfg['resource_cfg'].\
- get('new_session_per_task', True)
+ _start_new_session = self.session.rcfg.new_session_per_task or False
self._prof.prof('task_run_start', uid=tid)
task['proc'] = sp.Popen(args = cmdline,
@@ -438,6 +433,9 @@ def _check_running(self, to_watch, to_cancel):
# poll subprocess object
exit_code = task['proc'].poll()
+ tasks_to_advance = list()
+ tasks_to_cancel = list()
+
if exit_code is None:
# process is still running - cancel if needed
@@ -469,9 +467,7 @@ def _check_running(self, to_watch, to_cancel):
self._prof.prof('task_run_cancel_stop', uid=tid)
self._prof.prof('unschedule_start', uid=tid)
- self.publish(rpc.AGENT_UNSCHEDULE_PUBSUB, task)
- self.advance_tasks(task, rps.CANCELED, publish=True,
- push=False)
+ tasks_to_cancel.append(task)
else:
@@ -491,9 +487,9 @@ def _check_running(self, to_watch, to_cancel):
if tid in to_cancel:
to_cancel.remove(tid)
del task['proc'] # proc is not json serializable
+ tasks_to_advance.append(task)
self._prof.prof('unschedule_start', uid=tid)
- self.publish(rpc.AGENT_UNSCHEDULE_PUBSUB, task)
if exit_code != 0:
# task failed - fail after staging output
@@ -508,8 +504,15 @@ def _check_running(self, to_watch, to_cancel):
# stdout/stderr
task['target_state'] = rps.DONE
- self.advance_tasks(task, rps.AGENT_STAGING_OUTPUT_PENDING,
- publish=True, push=True)
+ self.publish(rpc.AGENT_UNSCHEDULE_PUBSUB,
+ tasks_to_cancel + tasks_to_advance)
+
+ if tasks_to_cancel:
+ self.advance(tasks_to_cancel, rps.CANCELED,
+ publish=True, push=False)
+ if tasks_to_advance:
+ self.advance(tasks_to_advance, rps.AGENT_STAGING_OUTPUT_PENDING,
+ publish=True, push=True)
return action
@@ -540,12 +543,19 @@ def _get_rp_funcs(self):
def _get_rp_env(self, task):
tid = task['uid']
+ td = task['description']
name = task.get('name') or tid
sbox = os.path.realpath(task['task_sandbox_path'])
if sbox.startswith(self._pwd):
sbox = '$RP_PILOT_SANDBOX%s' % sbox[len(self._pwd):]
+ gpr = td['gpus_per_rank']
+ if int(gpr) == gpr:
+ gpr = '%d' % gpr
+ else:
+ gpr = '%f' % gpr
+
ret = '\n'
ret += 'export RP_TASK_ID="%s"\n' % tid
ret += 'export RP_TASK_NAME="%s"\n' % name
@@ -556,11 +566,14 @@ def _get_rp_env(self, task):
ret += 'export RP_SESSION_SANDBOX="%s"\n' % self.ssbox
ret += 'export RP_PILOT_SANDBOX="%s"\n' % self.psbox
ret += 'export RP_TASK_SANDBOX="%s"\n' % sbox
+ ret += 'export RP_REGISTRY_ADDRESS="%s"\n' % self.session.reg_addr
+ ret += 'export RP_CORES_PER_RANK=%d\n' % td['cores_per_rank']
+ ret += 'export RP_GPUS_PER_RANK=%s\n' % gpr
+
# FIXME AM
# ret += 'export RP_LFS="%s"\n' % self.lfs
ret += 'export RP_GTOD="%s"\n' % self.gtod
ret += 'export RP_PROF="%s"\n' % self.prof
- # ret += 'export RP_REGISTRY_URL="%s"\n' % self.reg_addr
if self._prof.enabled:
ret += 'export RP_PROF_TGT="%s/%s.prof"\n' % (sbox, tid)
@@ -695,7 +708,7 @@ def _extend_pre_exec(self, td, ranks=None):
td['pre_exec'].append(rank_env)
# pre-defined `pre_exec` per platform configuration
- td['pre_exec'].extend(ru.as_list(self._cfg.get('task_pre_exec')))
+ td['pre_exec'].extend(ru.as_list(self.session.rcfg.get('task_pre_exec')))
# --------------------------------------------------------------------------
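
Two of the popen changes above, shown in isolation: `RP_GPUS_PER_RANK` is exported as an integer string for whole GPU counts and as a float string for fractional shares, and `_check_running` now buckets finished tasks so that unschedule and advance happen once per bulk rather than once per task. Both snippets are illustrative rewrites, not the component code itself:

def fmt_gpr(gpr):
    # integer string for whole GPU counts, float string for fractions
    return '%d' % gpr if int(gpr) == gpr else '%f' % gpr

assert fmt_gpr(2.0) == '2'
assert fmt_gpr(0.5) == '0.500000'

def bucket_finished(tasks, canceled_uids):
    # collect first, publish once: one unschedule message and one advance
    # call per bucket instead of one per task
    to_cancel, to_advance = [], []
    for task in tasks:
        if task.get('exit_code') is None:
            continue                          # still running
        if task['uid'] in canceled_uids:
            to_cancel.append(task)
        else:
            to_advance.append(task)
    return to_cancel, to_advance
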
diff --git a/src/radical/pilot/agent/executing/sleep.py b/src/radical/pilot/agent/executing/sleep.py
index 5358dddb4c..798e6351fd 100644
--- a/src/radical/pilot/agent/executing/sleep.py
+++ b/src/radical/pilot/agent/executing/sleep.py
@@ -39,14 +39,14 @@ def initialize(self):
self.register_output(rps.AGENT_STAGING_OUTPUT_PENDING,
rpc.AGENT_STAGING_OUTPUT_QUEUE)
- self.register_publisher (rpc.AGENT_UNSCHEDULE_PUBSUB)
+ self.register_publisher(rpc.AGENT_UNSCHEDULE_PUBSUB)
self._terminate = mt.Event()
- self._tasks_lock = ru.RLock()
+ self._tasks_lock = mt.RLock()
self._tasks = list()
- self._delay = 0.1
+ self._delay = 1.0
- self._watcher = mt.Thread(target=self._timed)
+ self._watcher = mt.Thread(target=self._collect)
self._watcher.daemon = True
self._watcher.start()
@@ -112,30 +112,57 @@ def _handle_task(self, task):
# --------------------------------------------------------------------------
#
- def _timed(self):
+ def _collect(self):
while not self._terminate.is_set():
- time.sleep(self._delay)
+ to_finish = list()
+ to_continue = list()
+ now = time.time()
with self._tasks_lock:
- now = time.time()
- to_finish = [t for t in self._tasks if t['to_finish'] <= now]
- self._tasks = [t for t in self._tasks if t['to_finish'] > now]
+
+ for task in self._tasks:
+ if task['deadline'] <= now: to_finish.append(task)
+ else : to_continue.append(task)
+
+ self._tasks = to_continue
+
+ if not to_finish:
+ time.sleep(self._delay)
+ continue
for task in to_finish:
uid = task['uid']
task['target_state'] = 'DONE'
+
self._prof.prof('rank_stop', uid=uid)
self._prof.prof('exec_stop', uid=uid)
self._prof.prof('launch_stop', uid=uid)
self._prof.prof('task_run_stop', uid=uid)
self._prof.prof('unschedule_start', uid=uid)
- self.publish(rpc.AGENT_UNSCHEDULE_PUBSUB, task)
+ self._log.debug('collected : %d', len(to_finish))
+
+ self.publish(rpc.AGENT_UNSCHEDULE_PUBSUB, to_finish)
self.advance_tasks(to_finish, rps.AGENT_STAGING_OUTPUT_PENDING,
publish=True, push=True)
+ # --------------------------------------------------------------------------
+ #
+ def control_cb(self, topic, msg):
+
+ self._log.info('control_cb [%s]: %s', topic, msg)
+
+ cmd = msg['cmd']
+
+ # FIXME RPC: already handled in the component base class
+ if cmd == 'cancel_tasks':
+
+ # FIXME: clarify how to cancel tasks
+ pass
+
+
# ------------------------------------------------------------------------------
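
The new `_collect` loop above partitions tasks by deadline under the lock and only sleeps when nothing completed. A self-contained sketch of the same pattern (task dicts with a `deadline` field; the real component additionally fires profile events and publishes the bulk):

import time
import threading

class DeadlineCollector:

    def __init__(self, on_done, delay=1.0):
        self._on_done = on_done               # bulk completion callback
        self._delay   = delay
        self._lock    = threading.RLock()
        self._tasks   = []
        self._term    = threading.Event()

    def add(self, task):
        with self._lock:
            self._tasks.append(task)

    def run(self):
        while not self._term.is_set():
            now = time.time()
            with self._lock:
                done        = [t for t in self._tasks if t['deadline'] <= now]
                self._tasks = [t for t in self._tasks if t['deadline'] >  now]
            if not done:
                time.sleep(self._delay)       # nothing finished: back off
                continue
            self._on_done(done)               # report the whole bulk at once

    def stop(self):
        self._term.set()
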
diff --git a/src/radical/pilot/agent/launch_method/mpirun.py b/src/radical/pilot/agent/launch_method/mpirun.py
index dfb51b9630..ffb349ce94 100644
--- a/src/radical/pilot/agent/launch_method/mpirun.py
+++ b/src/radical/pilot/agent/launch_method/mpirun.py
@@ -35,7 +35,7 @@ def _init_from_scratch(self, env, env_sh):
components (including Raptor and other task overlays) can use them to
launch tasks.
- The first use (likely in `agent.0`) will call this initializer to
+ The first use (likely in `agent_0`) will call this initializer to
inspect LM properties. Later uses will be able to use the information
gathered and should re-initialize via `_init_from_info()`, using the
info dict returned here.
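
The docstring above describes the launch method bootstrap pattern: the first instantiation probes the system (`_init_from_scratch`) and returns an info dict, and later instantiations rehydrate cheaply from that dict (`_init_from_info`). A schematic of the pattern, with a plain dict standing in for the RP registry service (all names here are invented):

_lm_registry = {}    # stands in for the RP registry service

class LaunchMethodSketch:

    def __init__(self, name):
        self.name = name
        info = _lm_registry.get(name)
        if info is None:
            info = self._init_from_scratch()  # expensive: probe the system
            _lm_registry[name] = info         # share with later users
        self._init_from_info(info)

    def _init_from_scratch(self):
        # e.g., locate the launcher binary, probe its version
        return {'command': '/usr/bin/mpirun', 'mpi_version': '4.1'}

    def _init_from_info(self, info):
        # cheap: rehydrate from the info gathered by the first instance
        self._command = info['command']
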
diff --git a/src/radical/pilot/agent/resource_manager/base.py b/src/radical/pilot/agent/resource_manager/base.py
index 3ed6bcee5d..dcb9c8d432 100644
--- a/src/radical/pilot/agent/resource_manager/base.py
+++ b/src/radical/pilot/agent/resource_manager/base.py
@@ -61,7 +61,7 @@ class RMInfo(ru.TypedDict):
'mem_per_node' : int, # memory per node (MB)
'details' : {None: None}, # dict of launch method info
- 'lm_info' : {str: None}, # dict of launch method info
+ 'launch_methods' : {str: None}, # dict of launch method info
}
_defaults = {
@@ -71,6 +71,7 @@ class RMInfo(ru.TypedDict):
'threads_per_core' : 1,
'gpus_per_node' : 0,
'threads_per_gpu' : 1,
+ 'launch_methods' : {}
}
@@ -123,10 +124,11 @@ class ResourceManager(object):
# --------------------------------------------------------------------------
#
- def __init__(self, cfg, log, prof):
+ def __init__(self, cfg, rcfg, log, prof):
self.name = type(self).__name__
self._cfg = cfg
+ self._rcfg = rcfg
self._log = log
self._prof = prof
@@ -151,12 +153,12 @@ def __init__(self, cfg, log, prof):
# have a valid info - store in registry and complete initialization
reg.put('rm.%s' % self.name.lower(), rm_info.as_dict())
- # set up launch methods even when initialized from registry info
- self._prepare_launch_methods(rm_info)
-
reg.close()
self._set_info(rm_info)
+ # set up launch methods even when initialized from registry info
+ self._prepare_launch_methods()
+
# --------------------------------------------------------------------------
#
@@ -219,11 +221,8 @@ def init_from_scratch(self):
rm_info.threads_per_gpu = 1
rm_info.mem_per_gpu = None
-
- rcfg = self._cfg.resource_cfg
- rm_info.mem_per_node = rcfg.mem_per_node or 0
-
- system_architecture = rcfg.get('system_architecture', {})
+ rm_info.mem_per_node = self._rcfg.mem_per_node or 0
+ system_architecture = self._rcfg.get('system_architecture', {})
rm_info.threads_per_core = int(os.environ.get('RADICAL_SMT') or
system_architecture.get('smt', 1))
@@ -311,31 +310,33 @@ def init_from_scratch(self):
if not rm_info.node_list:
raise RuntimeError('ResourceManager has no nodes left to run tasks')
+ # add launch method information to rm_info
+ rm_info.launch_methods = self._rcfg.launch_methods
+
return rm_info
# --------------------------------------------------------------------------
#
- def _prepare_launch_methods(self, rm_info):
-
- launch_methods = self._cfg.resource_cfg.launch_methods
+ def _prepare_launch_methods(self):
+ launch_methods = self._rm_info.launch_methods
self._launchers = {}
self._launch_order = launch_methods.get('order') or list(launch_methods)
for lm_name in list(self._launch_order):
- lm_cfg = launch_methods[lm_name]
+ lm_cfg = ru.Config(launch_methods[lm_name])
try:
self._log.debug('prepare lm %s', lm_name)
- lm_cfg['pid'] = self._cfg.pid
- lm_cfg['reg_addr'] = self._cfg.reg_addr
- lm_cfg['resource'] = self._cfg.resource
+ lm_cfg.pid = self._cfg.pid
+ lm_cfg.reg_addr = self._cfg.reg_addr
+ lm_cfg.resource = self._cfg.resource
self._launchers[lm_name] = rpa.LaunchMethod.create(
- lm_name, lm_cfg, rm_info, self._log, self._prof)
+ lm_name, lm_cfg, self._rm_info, self._log, self._prof)
- except:
+        except Exception:
self._log.exception('skip lm %s', lm_name)
self._launch_order.remove(lm_name)
@@ -365,7 +366,7 @@ def get_partitions(self):
# ResourceManager.
#
@classmethod
- def create(cls, name, cfg, log, prof):
+ def create(cls, name, cfg, rcfg, log, prof):
from .ccm import CCM
from .fork import Fork
@@ -396,7 +397,7 @@ def create(cls, name, cfg, log, prof):
if name not in impl:
raise RuntimeError('ResourceManager %s unknown' % name)
- return impl[name](cfg, log, prof)
+ return impl[name](cfg, rcfg, log, prof)
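
`_prepare_launch_methods` above now takes its launcher list from `rm_info.launch_methods`, honors an optional `order` key, and drops any launcher whose setup fails. The selection logic in isolation; `prepare_launchers` and the `factories` dict are illustrative stand-ins for the `LaunchMethod.create` factory:

import logging

def prepare_launchers(launch_methods, factories, log):

    launchers = {}
    order     = launch_methods.get('order') or \
                [k for k in launch_methods if k != 'order']

    for lm_name in list(order):
        try:
            launchers[lm_name] = factories[lm_name](launch_methods[lm_name])
        except Exception:
            log.exception('skip lm %s', lm_name)   # failed setup: drop it
            order.remove(lm_name)

    return launchers, order

launchers, order = prepare_launchers(
        {'order': ['SRUN', 'MPIRUN'], 'SRUN': {}, 'MPIRUN': {}},
        {'SRUN': dict, 'MPIRUN': dict}, logging.getLogger(__name__))
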
diff --git a/src/radical/pilot/agent/resource_manager/fork.py b/src/radical/pilot/agent/resource_manager/fork.py
index 92c1f688d5..9b068403fc 100644
--- a/src/radical/pilot/agent/resource_manager/fork.py
+++ b/src/radical/pilot/agent/resource_manager/fork.py
@@ -20,7 +20,7 @@ def _init_from_scratch(self, rm_info: RMInfo) -> RMInfo:
if not rm_info.cores_per_node:
rm_info.cores_per_node = detected_cores
- if self._cfg.resource_cfg.fake_resources:
+ if self._rcfg.fake_resources:
self._log.info(
'virtual resource with %d cores per node (%d detected cores)' %
(rm_info.cores_per_node, detected_cores))
diff --git a/src/radical/pilot/agent/scheduler/base.py b/src/radical/pilot/agent/scheduler/base.py
index 18a74dcbfd..5faef6f2db 100644
--- a/src/radical/pilot/agent/scheduler/base.py
+++ b/src/radical/pilot/agent/scheduler/base.py
@@ -216,8 +216,11 @@ def initialize(self):
# The scheduler needs the ResourceManager information which have been
# collected during agent startup.
- self._rm = ResourceManager.create(self._cfg.resource_manager,
- self._cfg, self._log, self._prof)
+ rm_name = self.session.rcfg.resource_manager
+ self._rm = ResourceManager.create(rm_name,
+ self.session.cfg,
+ self.session.rcfg,
+ self._log, self._prof)
self._partitions = self._rm.get_partitions() # {plabel : [node_ids]}
@@ -256,6 +259,7 @@ def initialize(self):
self.register_subscriber(rpc.AGENT_UNSCHEDULE_PUBSUB, self.unschedule_cb)
# start a process to host the actual scheduling algorithm
+ self._scheduler_process = False
self._p = mp.Process(target=self._schedule_tasks)
self._p.daemon = True
self._p.start()
@@ -286,7 +290,7 @@ def create(cls, cfg, session):
if cls != AgentSchedulingComponent:
raise TypeError("Scheduler Factory only available to base class!")
- name = cfg['scheduler']
+ name = session.rcfg.agent_scheduler
from .continuous_ordered import ContinuousOrdered
from .continuous_colo import ContinuousColo
@@ -313,16 +317,22 @@ def create(cls, cfg, session):
# --------------------------------------------------------------------------
#
- def _control_cb(self, topic, msg):
+ def control_cb(self, topic, msg):
'''
listen on the control channel for raptor queue registration commands
'''
+
+ # only the scheduler process listens for control messages
+ if not self._scheduler_process:
+ return
cmd = msg['cmd']
arg = msg['arg']
if cmd == 'register_named_env':
+
env_name = arg['env_name']
self._named_envs.append(env_name)
@@ -383,12 +393,14 @@ def _control_cb(self, topic, msg):
self.advance(tasks, state=rps.FAILED,
publish=True, push=False)
+ # FIXME: RPC: this is caught in the base class handler already
elif cmd == 'cancel_tasks':
uids = arg['uids']
to_cancel = list()
with self._lock:
for uid in uids:
if uid in self._waitpool:
to_cancel.append(self._waitpool[uid])
del self._waitpool[uid]
@@ -410,8 +422,6 @@ def _control_cb(self, topic, msg):
else:
self._log.debug('command ignored: [%s]', cmd)
- return True
-
# --------------------------------------------------------------------------
#
@@ -593,6 +603,17 @@ def _schedule_tasks(self):
tasks.
'''
+ self._scheduler_process = True
+
+ # ZMQ endpoints will not have survived the fork. Specifically the
+ # registry client of the component base class will have to reconnect.
+        # Note that `self._reg` of the base class is a *pointer* to the session
+ # registry.
+ #
+ # FIXME: should be moved into a post-fork hook of the session
+ #
+ self._reg = ru.zmq.RegistryClient(url=self.session.cfg.reg_addr)
+
# FIXME: the component does not clean out subscribers after fork :-/
self._subscribers = dict()
@@ -639,13 +660,13 @@ def _schedule_tasks(self):
self._raptor_tasks = dict() # raptor_master_id : [task]
self._raptor_lock = mt.Lock() # lock for the above
- # subscribe to control messages, e.g., to register raptor queues
- self.register_subscriber(rpc.CONTROL_PUBSUB, self._control_cb)
-
# register task output channels
self.register_output(rps.AGENT_EXECUTING_PENDING,
rpc.AGENT_EXECUTING_QUEUE)
+ # re-register the control callback in this subprocess
+        self.register_subscriber(rpc.CONTROL_PUBSUB, self.control_cb)
+
self._publishers = dict()
self.register_publisher(rpc.STATE_PUBSUB)
@@ -938,8 +959,8 @@ def _unschedule_completed(self):
# in a max added latency of about 0.1 second, which is one order of
# magnitude above our noise level again and thus acceptable (tm).
while not self._term.is_set():
- task = self._queue_unsched.get(timeout=0.01)
- to_unschedule.append(task)
+ tasks = self._queue_unsched.get(timeout=0.01)
+ to_unschedule += ru.as_list(tasks)
if len(to_unschedule) > 512:
break
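
The comment block added above states the core constraint: ZMQ sockets and the registry client do not survive the `mp.Process` fork, so `_schedule_tasks` reconnects and re-registers its subscribers first thing in the child. A generic sketch of that pattern (`ForkedWorker` and `_connect` are invented; in the diff the reconnect is `ru.zmq.RegistryClient(url=self.session.cfg.reg_addr)`):

import multiprocessing as mp

class ForkedWorker:

    def __init__(self, reg_addr):
        self._reg_addr = reg_addr
        self._conn     = self._connect()      # parent-side connection

    def _connect(self):
        # stand-in for creating a real ZMQ client to self._reg_addr
        return {'url': self._reg_addr}

    def start(self):
        proc = mp.Process(target=self._work)
        proc.daemon = True
        proc.start()
        return proc

    def _work(self):
        # child process: the parent's connection state did not survive the
        # fork -- reconnect before touching any communication channel
        self._conn = self._connect()
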
diff --git a/src/radical/pilot/agent/scheduler/continuous.py b/src/radical/pilot/agent/scheduler/continuous.py
index 532e9b608d..75d592d283 100644
--- a/src/radical/pilot/agent/scheduler/continuous.py
+++ b/src/radical/pilot/agent/scheduler/continuous.py
@@ -2,9 +2,12 @@
__copyright__ = 'Copyright 2013-2021, The RADICAL-Cybertools Team'
__license__ = 'MIT'
-import math as m
import pprint
+import math as m
+
+import radical.utils as ru
+
from ... import constants as rpc
from .base import AgentSchedulingComponent
@@ -93,7 +96,7 @@ def _configure(self):
# this option is set. This implementation is not optimized for the
# scattered mode! The default is 'False'.
#
- self._scattered = self._cfg.get('scattered', False)
+ self._scattered = self.session.rcfg.get('scattered', False)
# --------------------------------------------------------------------------
@@ -113,7 +116,7 @@ def _iterate_nodes(self):
# --------------------------------------------------------------------------
#
- def unschedule_task(self, task):
+ def unschedule_task(self, tasks):
'''
This method is called when previously aquired resources are not needed
anymore. `slots` are the resource slots as previously returned by
@@ -121,7 +124,8 @@ def unschedule_task(self, task):
'''
# reflect the request in the nodelist state (set to `FREE`)
- self._change_slot_states(task['slots'], rpc.FREE)
+ for task in ru.as_list(tasks):
+ self._change_slot_states(task['slots'], rpc.FREE)
# --------------------------------------------------------------------------
@@ -159,6 +163,8 @@ def _find_resources(self, node, find_slots, ranks_per_slot, cores_per_slot,
thread count and using physical core IDs for process placement?
'''
+ # self._log.debug('find on %s: %s * [%s, %s]', node['uid'], )
+
# check if the node can host the request
free_cores = node['cores'].count(rpc.FREE)
free_gpus = node['gpus'].count(rpc.FREE)
@@ -319,22 +325,31 @@ def schedule_task(self, task):
'too much mem per proc %s' % mem_per_slot
         # check what resource type limits the number of slots per node
+ tmp = list()
slots_per_node = int(m.floor(cores_per_node / cores_per_slot))
+ tmp.append([cores_per_node, cores_per_slot, slots_per_node])
if gpus_per_slot:
slots_per_node = min(slots_per_node,
int(m.floor(gpus_per_node / gpus_per_slot)))
+ tmp.append([gpus_per_node, gpus_per_slot, slots_per_node])
if lfs_per_slot:
slots_per_node = min(slots_per_node,
int(m.floor(lfs_per_node / lfs_per_slot)))
+ tmp.append([lfs_per_node, lfs_per_slot, slots_per_node])
if mem_per_slot:
slots_per_node = min(slots_per_node,
int(m.floor(mem_per_node / mem_per_slot)))
+ tmp.append([mem_per_node, mem_per_slot, slots_per_node])
if not mpi and req_slots > slots_per_node:
- raise ValueError('non-mpi task does not fit on a single node')
+            raise ValueError('non-mpi task does not fit on a single node: '
+ '%s * %s:%s > %s:%s -- %s > %s [%s %s] %s' % (req_slots,
+ cores_per_slot, gpus_per_slot,
+ cores_per_node, gpus_per_node, req_slots,
+ slots_per_node, cores_per_slot, gpus_per_slot, tmp))
# set conditions to find the first matching node
is_first = True
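
The `slots_per_node` computation above takes the minimum over every resource dimension the task actually requests. The arithmetic in isolation (function name and defaults are illustrative):

import math

def slots_per_node(cores_per_node, cores_per_slot,
                   gpus_per_node=0, gpus_per_slot=0,
                   lfs_per_node=0,  lfs_per_slot=0,
                   mem_per_node=0,  mem_per_slot=0):

    n = math.floor(cores_per_node / cores_per_slot)
    if gpus_per_slot: n = min(n, math.floor(gpus_per_node / gpus_per_slot))
    if lfs_per_slot : n = min(n, math.floor(lfs_per_node  / lfs_per_slot))
    if mem_per_slot : n = min(n, math.floor(mem_per_node  / mem_per_slot))
    return int(n)

# 64 cores and 4 GPUs per node, slots of 8 cores + 1 GPU: GPUs limit us to 4
assert slots_per_node(64, 8, gpus_per_node=4, gpus_per_slot=1) == 4
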
diff --git a/src/radical/pilot/agent/scheduler/flux.py b/src/radical/pilot/agent/scheduler/flux.py
index e600ec2f5f..fef146d8aa 100644
--- a/src/radical/pilot/agent/scheduler/flux.py
+++ b/src/radical/pilot/agent/scheduler/flux.py
@@ -69,14 +69,14 @@ def _configure(self):
# performed in retrospect by the executor, based on the scheduling and
# execution events collected from Flux.
qname = rpc.AGENT_EXECUTING_QUEUE
- fname = '%s/%s.cfg' % (self._cfg.path, qname)
- cfg = ru.read_json(fname)
+ cfg = self._reg['bridges.%s' % qname]
self._q = ru.zmq.Putter(qname, cfg['put'])
- lm_cfg = self._cfg.resource_cfg.launch_methods.get('FLUX')
- lm_cfg['pid'] = self._cfg.pid
- lm_cfg['reg_addr'] = self._cfg.reg_addr
- self._lm = LaunchMethod.create('FLUX', lm_cfg, self._cfg,
+ lm_cfg = self.session.rcfg.launch_methods.get('FLUX')
+ lm_cfg['pid'] = self.session.cfg.pid
+ lm_cfg['reg_addr'] = self.session.cfg.reg_addr
+ self._lm = LaunchMethod.create('FLUX', lm_cfg,
+ self.session.cfg,
self._log, self._prof)
diff --git a/src/radical/pilot/agent/scheduler/hombre.py b/src/radical/pilot/agent/scheduler/hombre.py
index 37f4394c3e..98749ae225 100644
--- a/src/radical/pilot/agent/scheduler/hombre.py
+++ b/src/radical/pilot/agent/scheduler/hombre.py
@@ -49,7 +49,7 @@ def _configure(self):
# `oversubscribe` is set to False (which is the default for now),
# we'll prevent that behavior by allocating one additional CPU core
# for each set of requested GPU processes.
- self._oversubscribe = self._cfg.get('oversubscribe', True)
+ self._oversubscribe = self.session.rcfg.get('oversubscribe', True)
if not self._oversubscribe:
raise ValueError('HOMBRE needs oversubscription enabled')
diff --git a/src/radical/pilot/agent/staging_output/default.py b/src/radical/pilot/agent/staging_output/default.py
index cdf8d53f7d..37e429cd94 100644
--- a/src/radical/pilot/agent/staging_output/default.py
+++ b/src/radical/pilot/agent/staging_output/default.py
@@ -49,8 +49,9 @@ def initialize(self):
self.register_input(rps.AGENT_STAGING_OUTPUT_PENDING,
rpc.AGENT_STAGING_OUTPUT_QUEUE, self.work)
- # we don't need an output queue -- tasks are picked up via mongodb
- self.register_output(rps.TMGR_STAGING_OUTPUT_PENDING, None) # drop
+ self.register_output(rps.TMGR_STAGING_OUTPUT_PENDING,
+ rpc.AGENT_COLLECTING_QUEUE)
+
# --------------------------------------------------------------------------
@@ -359,7 +360,7 @@ def _handle_task_staging(self, task, actionables):
# all agent staging is done -- pass on to tmgr output staging
self.advance(task, rps.TMGR_STAGING_OUTPUT_PENDING,
- publish=True, push=False)
+ publish=True, push=True)
# ------------------------------------------------------------------------------
diff --git a/src/radical/pilot/configs/agent_debug_sa.json b/src/radical/pilot/configs/agent_debug_sa.json
new file mode 100644
index 0000000000..dac7a4a11d
--- /dev/null
+++ b/src/radical/pilot/configs/agent_debug_sa.json
@@ -0,0 +1,99 @@
+
+{
+ "staging_area" : "staging_area",
+ "staging_schema" : "staging",
+ "max_io_loglength" : 1024,
+
+ "bulk_time" : 10.0,
+ "bulk_size" : 4096,
+
+ "heartbeat" : {
+ "interval" : 10.0,
+ "timeout" : 30.0
+ },
+
+ "target" : "local",
+ "bridges" : {
+ "agent_staging_input_queue" : { "kind" : "queue",
+ "log_level" : "error",
+ "stall_hwm" : 1,
+ "bulk_size" : 1024},
+ "agent_scheduling_queue" : { "kind" : "queue",
+ "log_level" : "error",
+ "stall_hwm" : 1,
+ "bulk_size" : 1024},
+ "agent_executing_queue" : { "kind" : "queue",
+ "log_level" : "error",
+ "stall_hwm" : 1,
+ "bulk_size" : 1024},
+ "agent_staging_output_queue" : { "kind" : "queue",
+ "log_level" : "error",
+ "stall_hwm" : 1,
+ "bulk_size" : 1024},
+
+ "agent_unschedule_pubsub" : { "kind" : "pubsub",
+ "log_level" : "error",
+ "stall_hwm" : 1,
+ "bulk_size" : 1024},
+ "agent_schedule_pubsub" : { "kind" : "pubsub",
+ "log_level" : "error",
+ "stall_hwm" : 1,
+ "bulk_size" : 1024},
+
+ "control_pubsub" : { "kind" : "pubsub",
+ "log_level" : "error",
+ "stall_hwm" : 1,
+ "bulk_size" : 1024},
+ "state_pubsub" : { "kind" : "pubsub",
+ "log_level" : "error",
+ "stall_hwm" : 1,
+ "bulk_size" : 1024},
+ "log_pubsub" : { "kind" : "pubsub",
+ "log_level" : "error",
+ "stall_hwm" : 1,
+ "bulk_size" : 1024}
+ },
+
+ "components" : {
+ "agent_staging_input" : {"count" : 4},
+ "agent_scheduling" : {"count" : 1},
+ "agent_executing" : {"count" : 4},
+ "agent_staging_output" : {"count" : 4}
+ # },
+ #
+ # "agents": {
+ # "agent_1": {
+ # "target": "node",
+ # "components": {
+ # "agent_staging_input" : {"count" : 1},
+ # "agent_executing" : {"count" : 1},
+ # "agent_staging_output" : {"count" : 1}
+ # }
+ # },
+ # "agent_2": {
+ # "target": "node",
+ # "components": {
+ # "agent_staging_input" : {"count" : 1},
+ # "agent_executing" : {"count" : 1},
+ # "agent_staging_output" : {"count" : 1}
+ # }
+ # },
+ # "agent_3": {
+ # "target": "node",
+ # "components": {
+ # "agent_staging_input" : {"count" : 1},
+ # "agent_executing" : {"count" : 1},
+ # "agent_staging_output" : {"count" : 1}
+ # }
+ # },
+ # "agent_4": {
+ # "target": "node",
+ # "components": {
+ # "agent_staging_input" : {"count" : 1},
+ # "agent_executing" : {"count" : 1},
+ # "agent_staging_output" : {"count" : 1}
+ # }
+ # }
+ }
+}
+
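
The new config above carries `#` comments, which plain `json.load` rejects; RP reads such files through its own config tooling. If one had to load a file like this by hand, stripping full-line comments before parsing suffices here, since the file has no inline comments (sketch only, `loads_commented` is a hypothetical helper):

import json

def loads_commented(text):
    # keep only lines that are not full-line '#' comments
    lines = [l for l in text.splitlines(True)
             if not l.lstrip().startswith('#')]
    return json.loads(''.join(lines))

cfg = loads_commented('''
{
    "components" : {
        # agents are disabled in this example
        "agent_executing" : {"count" : 4}
    }
}
''')
assert cfg['components']['agent_executing']['count'] == 4
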
diff --git a/src/radical/pilot/configs/agent_default.json b/src/radical/pilot/configs/agent_default.json
index af607c02fe..5384ae4e94 100644
--- a/src/radical/pilot/configs/agent_default.json
+++ b/src/radical/pilot/configs/agent_default.json
@@ -4,25 +4,13 @@
 # a functional pilot agent, without any component redundancy.
{
- # max number of updates to put into a db bulk
- "bulk_collection_size" : 1024,
-
- # max time period to collect db notifications into bulks (seconds)
- "bulk_collection_time" : 1.0,
-
- # time to sleep between database polls (seconds)
- "db_poll_sleeptime" : 2.0,
-
- # agent.0 must always have target 'local' at this point
+ # agent_0 must always have target 'local' at this point
# mode 'shared' : local node is also used for CUs
# mode 'reserved' : local node is reserved for the agent
# FIXME: mode is unused
"target" : "local",
"mode" : "shared",
- "bulk_time" : 1.0,
- "bulk_size" : 1024,
-
"heartbeat" : {
"interval" : 1.0,
"timeout" : 60.0
@@ -35,45 +23,24 @@
# stall_hwm and batch_size is 1 (no stalling, no bulking).
#
"bridges" : {
- "agent_staging_input_queue" : { "kind" : "queue",
- "log_level" : "error",
- "stall_hwm" : 0,
- "bulk_size" : 0},
- "agent_scheduling_queue" : { "kind" : "queue",
- "log_level" : "error",
- "stall_hwm" : 0,
- "bulk_size" : 0},
- "agent_executing_queue" : { "kind" : "queue",
- "log_level" : "error",
- "stall_hwm" : 0,
- "bulk_size" : 0},
- "agent_staging_output_queue" : { "kind" : "queue",
- "log_level" : "error",
- "stall_hwm" : 0,
- "bulk_size" : 0},
+ "agent_staging_input_queue" : {"kind": "queue", "log_lvl":"error"},
+ "agent_scheduling_queue" : {"kind": "queue", "log_lvl":"error"},
+ "agent_executing_queue" : {"kind": "queue", "log_lvl":"error"},
+ "agent_staging_output_queue" : {"kind": "queue", "log_lvl":"error"},
+ "agent_collecting_queue" : {"kind": "queue", "log_lvl":"error"},
+
+ "raptor_scheduling_queue" : {"kind": "queue", "log_lvl":"error"},
- "raptor_scheduling_queue" : { "kind" : "queue",
- "log_level" : "error",
- "stall_hwm" : 0,
- "bulk_size" : 1},
+ "agent_unschedule_pubsub" : {"kind": "pubsub", "log_lvl":"error"},
+ "agent_schedule_pubsub" : {"kind": "pubsub", "log_lvl":"error"},
- "agent_unschedule_pubsub" : { "kind" : "pubsub",
- "log_level" : "error"},
- "agent_schedule_pubsub" : { "kind" : "pubsub",
- "log_level" : "error"},
+ "control_pubsub" : {"kind": "pubsub", "log_lvl":"error"},
+ "state_pubsub" : {"kind": "pubsub", "log_lvl":"error"}
- "control_pubsub" : { "kind" : "pubsub",
- "log_level" : "error"},
- "state_pubsub" : { "kind" : "pubsub",
- "log_level" : "error"}
- # "log_pubsub" : { "kind" : "pubsub",
- # "log_level" : "error"}
+ # "log_pubsub" : {"kind": "pubsub", "log_lvl":"error"}
},
"components" : {
- # the update worker must live in agent.0, since only that agent is
- # sure to have connectivity toward the DB.
- "update" : {"count" : 1},
"agent_staging_input" : {"count" : 1},
"agent_scheduling" : {"count" : 1},
"agent_executing" : {"count" : 1},
diff --git a/src/radical/pilot/configs/agent_default_sa.json b/src/radical/pilot/configs/agent_default_sa.json
index b851f919af..9d4fa39dfd 100644
--- a/src/radical/pilot/configs/agent_default_sa.json
+++ b/src/radical/pilot/configs/agent_default_sa.json
@@ -4,25 +4,13 @@
 # a functional pilot agent, without any component redundancy.
{
- # max number of updates to put into a db bulk
- "bulk_collection_size" : 1024,
-
- # max time period to collect db notifications into bulks (seconds)
- "bulk_collection_time" : 1.0,
-
- # time to sleep between database polls (seconds)
- "db_poll_sleeptime" : 2.0,
-
- # agent.0 must always have target 'local' at this point
+ # agent_0 must always have target 'local' at this point
# mode 'shared' : local node is also used for CUs
# mode 'reserved' : local node is reserved for the agent
# FIXME: mode is unused
"target" : "local",
"mode" : "shared",
- "bulk_time" : 1.0,
- "bulk_size" : 1024,
-
"heartbeat" : {
"interval" : 1.0,
"timeout" : 60.0
@@ -71,9 +59,6 @@
},
"components" : {
- # the update worker must live in agent.0, since only that agent is
- # sure to have connectivity toward the DB.
- "update" : {"count" : 1},
"agent_staging_input" : {"count" : 1},
"agent_scheduling" : {"count" : 1},
# "agent_executing" : {"count" : 1},
diff --git a/src/radical/pilot/configs/pmgr_default.json b/src/radical/pilot/configs/pmgr_default.json
index 472f8f2afe..804e96a6d4 100644
--- a/src/radical/pilot/configs/pmgr_default.json
+++ b/src/radical/pilot/configs/pmgr_default.json
@@ -4,18 +4,21 @@
# a functional pilot manager.
{
# time to sleep between database polls (seconds)
+
+ "heartbeat" : {
+ "interval" : 3.0,
+ "timeout" : 10.0
+ },
+
"db_poll_sleeptime" : 1.0,
"bridges" : {
- "pmgr_launching_queue" : {"kind" : "queue",
- "log_level" : "error",
- "stall_hwm" : 0,
- "bulk_size" : 0}
+ "pmgr_launching_queue" : {"kind": "queue"}
},
"components" : {
# how many instances of the respective components should be started
- "pmgr_launching" : {"count" : 1}
+ "pmgr_launching" : {"count": 1}
}
}
diff --git a/src/radical/pilot/configs/resource_access.json b/src/radical/pilot/configs/resource_access.json
index 8c0cf9250c..93c0bd6314 100644
--- a/src/radical/pilot/configs/resource_access.json
+++ b/src/radical/pilot/configs/resource_access.json
@@ -3,16 +3,16 @@
"expanse": {
"description" : "(https://www.sdsc.edu/support/user_guides/expanse.html).",
"notes" : "Always set the ``project`` attribute in the PilotDescription.",
- "schemas" : ["local", "ssh"],
- "local" :
- {
- "job_manager_endpoint" : "slurm://expanse.sdsc.xsede.org",
- "filesystem_endpoint" : "file://expanse.sdsc.xsede.org"
- },
- "ssh" :
- {
- "job_manager_endpoint" : "slurm+ssh://expanse.sdsc.xsede.org",
- "filesystem_endpoint" : "sftp://expanse.sdsc.xsede.org"
+ "default_schema" : "local",
+ "schemas" : {
+ "local" : {
+ "job_manager_endpoint" : "slurm://expanse.sdsc.xsede.org",
+ "filesystem_endpoint" : "file://expanse.sdsc.xsede.org"
+ },
+ "ssh" : {
+ "job_manager_endpoint" : "slurm+ssh://expanse.sdsc.xsede.org",
+ "filesystem_endpoint" : "sftp://expanse.sdsc.xsede.org"
+ }
},
"default_remote_workdir" : "/expanse/lustre/scratch/$USER/temp_project",
"default_queue" : "compute",
@@ -41,22 +41,21 @@
"stampede2_ssh": {
"description" : "The ACCESS 'Stampede' cluster at TACC (https://docs.tacc.utexas.edu/hpc/stampede2/).",
"notes" : "Always set the ``project`` attribute in the PilotDescription or the pilot will fail.",
- "schemas" : ["local", "gsissh", "ssh"],
"mandatory_args" : ["project"],
- "gsissh" :
- {
- "job_manager_endpoint" : "slurm+gsissh://stampede2.tacc.utexas.edu:2222/",
- "filesystem_endpoint" : "gsisftp://stampede2.tacc.utexas.edu:2222/"
- },
- "ssh" :
- {
- "job_manager_endpoint" : "slurm+ssh://stampede2.tacc.utexas.edu/",
- "filesystem_endpoint" : "sftp://stampede2.tacc.utexas.edu/"
- },
- "local" :
- {
- "job_manager_endpoint" : "slurm://stampede2.tacc.utexas.edu/",
- "filesystem_endpoint" : "file://stampede2.tacc.utexas.edu/"
+ "default_schema" : "local",
+ "schemas" : {
+ "gsissh" : {
+ "job_manager_endpoint": "slurm+gsissh://stampede2.tacc.utexas.edu:2222/",
+ "filesystem_endpoint" : "gsisftp://stampede2.tacc.utexas.edu:2222/"
+ },
+ "ssh" : {
+ "job_manager_endpoint": "slurm+ssh://stampede2.tacc.utexas.edu/",
+ "filesystem_endpoint" : "sftp://stampede2.tacc.utexas.edu/"
+ },
+ "local" : {
+ "job_manager_endpoint": "slurm://stampede2.tacc.utexas.edu/",
+ "filesystem_endpoint" : "file://stampede2.tacc.utexas.edu/"
+ }
},
"default_queue" : "normal",
"resource_manager" : "SLURM",
@@ -89,22 +88,21 @@
"stampede2_mpirun": {
"description" : "The ACCESS 'Stampede' cluster at TACC (https://docs.tacc.utexas.edu/hpc/stampede2/).",
"notes" : "Always set the ``project`` attribute in the PilotDescription or the pilot will fail.",
- "schemas" : ["local", "gsissh", "ssh"],
"mandatory_args" : ["project"],
- "gsissh" :
- {
- "job_manager_endpoint" : "slurm+gsissh://stampede2.tacc.utexas.edu:2222/",
- "filesystem_endpoint" : "gsisftp://stampede2.tacc.utexas.edu:2222/"
- },
- "ssh" :
- {
- "job_manager_endpoint" : "slurm+ssh://stampede2.tacc.utexas.edu/",
- "filesystem_endpoint" : "sftp://stampede2.tacc.utexas.edu/"
- },
- "local" :
- {
- "job_manager_endpoint" : "slurm://stampede2.tacc.utexas.edu/",
- "filesystem_endpoint" : "file://stampede2.tacc.utexas.edu/"
+ "default_schema" : "local",
+ "schemas" : {
+ "gsissh" : {
+ "job_manager_endpoint": "slurm+gsissh://stampede2.tacc.utexas.edu:2222/",
+ "filesystem_endpoint" : "gsisftp://stampede2.tacc.utexas.edu:2222/"
+ },
+ "ssh" : {
+ "job_manager_endpoint": "slurm+ssh://stampede2.tacc.utexas.edu/",
+ "filesystem_endpoint" : "sftp://stampede2.tacc.utexas.edu/"
+ },
+ "local" : {
+ "job_manager_endpoint": "slurm://stampede2.tacc.utexas.edu/",
+ "filesystem_endpoint" : "file://stampede2.tacc.utexas.edu/"
+ }
},
"default_queue" : "normal",
"resource_manager" : "SLURM",
@@ -133,22 +131,21 @@
"stampede2_ibrun_repex": {
"description" : "The ACCESS 'Stampede' cluster at TACC (https://docs.tacc.utexas.edu/hpc/stampede2/).",
"notes" : "Always set the ``project`` attribute in the PilotDescription or the pilot will fail.",
- "schemas" : ["local", "gsissh", "ssh"],
"mandatory_args" : ["project"],
- "gsissh" :
- {
- "job_manager_endpoint" : "slurm+gsissh://stampede2.tacc.utexas.edu:2222/",
- "filesystem_endpoint" : "gsisftp://stampede2.tacc.utexas.edu:2222/"
- },
- "ssh" :
- {
- "job_manager_endpoint" : "slurm+ssh://stampede2.tacc.utexas.edu/",
- "filesystem_endpoint" : "sftp://stampede2.tacc.utexas.edu/"
- },
- "local" :
- {
- "job_manager_endpoint" : "slurm://stampede2.tacc.utexas.edu/",
- "filesystem_endpoint" : "file://stampede2.tacc.utexas.edu/"
+ "default_schema" : "local",
+ "schemas" : {
+ "gsissh" : {
+ "job_manager_endpoint": "slurm+gsissh://stampede2.tacc.utexas.edu:2222/",
+ "filesystem_endpoint" : "gsisftp://stampede2.tacc.utexas.edu:2222/"
+ },
+ "ssh" : {
+ "job_manager_endpoint": "slurm+ssh://stampede2.tacc.utexas.edu/",
+ "filesystem_endpoint" : "sftp://stampede2.tacc.utexas.edu/"
+ },
+ "local" : {
+ "job_manager_endpoint": "slurm://stampede2.tacc.utexas.edu/",
+ "filesystem_endpoint" : "file://stampede2.tacc.utexas.edu/"
+ }
},
"cores_per_node" : 68,
"default_queue" : "normal",
@@ -181,22 +178,21 @@
"stampede2_ibrun": {
"description" : "The ACCESS 'Stampede' cluster at TACC (https://docs.tacc.utexas.edu/hpc/stampede2/).",
"notes" : "Always set the ``project`` attribute in the ComputePilotDescription or the pilot will fail.",
- "schemas" : ["local", "gsissh", "ssh"],
"mandatory_args" : ["project"],
- "gsissh" :
- {
- "job_manager_endpoint" : "slurm+gsissh://stampede2.tacc.utexas.edu:2222/",
- "filesystem_endpoint" : "gsisftp://stampede2.tacc.utexas.edu:2222/"
- },
- "ssh" :
- {
- "job_manager_endpoint" : "slurm+ssh://stampede2.tacc.utexas.edu/",
- "filesystem_endpoint" : "sftp://stampede2.tacc.utexas.edu/"
- },
- "local" :
- {
- "job_manager_endpoint" : "slurm://stampede2.tacc.utexas.edu/",
- "filesystem_endpoint" : "file://stampede2.tacc.utexas.edu/"
+ "default_schema" : "local",
+ "schemas" : {
+ "gsissh" : {
+ "job_manager_endpoint": "slurm+gsissh://stampede2.tacc.utexas.edu:2222/",
+ "filesystem_endpoint" : "gsisftp://stampede2.tacc.utexas.edu:2222/"
+ },
+ "ssh" : {
+ "job_manager_endpoint": "slurm+ssh://stampede2.tacc.utexas.edu/",
+ "filesystem_endpoint" : "sftp://stampede2.tacc.utexas.edu/"
+ },
+ "local" : {
+ "job_manager_endpoint": "slurm://stampede2.tacc.utexas.edu/",
+ "filesystem_endpoint" : "file://stampede2.tacc.utexas.edu/"
+ }
},
"cores_per_node" : 68,
"default_queue" : "normal",
@@ -226,22 +222,21 @@
"stampede2_srun": {
"description" : "The ACCESS 'Stampede' cluster at TACC (https://docs.tacc.utexas.edu/hpc/stampede2/).",
"notes" : "Always set the ``project`` attribute in the PilotDescription or the pilot will fail.",
- "schemas" : ["local", "gsissh", "ssh"],
"mandatory_args" : ["project"],
- "gsissh" :
- {
- "job_manager_endpoint" : "slurm+gsissh://stampede2.tacc.utexas.edu:2222/",
- "filesystem_endpoint" : "gsisftp://stampede2.tacc.utexas.edu:2222/"
- },
- "ssh" :
- {
- "job_manager_endpoint" : "slurm+ssh://stampede2.tacc.utexas.edu/",
- "filesystem_endpoint" : "sftp://stampede2.tacc.utexas.edu/"
- },
- "local" :
- {
- "job_manager_endpoint" : "slurm://stampede2.tacc.utexas.edu/",
- "filesystem_endpoint" : "file://stampede2.tacc.utexas.edu/"
+ "default_schema" : "local",
+ "schemas" : {
+ "gsissh" : {
+ "job_manager_endpoint": "slurm+gsissh://stampede2.tacc.utexas.edu:2222/",
+ "filesystem_endpoint" : "gsisftp://stampede2.tacc.utexas.edu:2222/"
+ },
+ "ssh" : {
+ "job_manager_endpoint": "slurm+ssh://stampede2.tacc.utexas.edu/",
+ "filesystem_endpoint" : "sftp://stampede2.tacc.utexas.edu/"
+ },
+ "local" : {
+ "job_manager_endpoint": "slurm://stampede2.tacc.utexas.edu/",
+ "filesystem_endpoint" : "file://stampede2.tacc.utexas.edu/"
+ }
},
"default_queue" : "normal",
"resource_manager" : "SLURM",
@@ -271,12 +266,13 @@
"comet": {
"description" : "The retired Comet HPC resource at SDSC 'HPC for the 99%%' (https://www.sdsc.edu/support/user_guides/comet.html).",
"notes" : "Always set the ``project`` attribute in the PilotDescription or the pilot will fail.",
- "schemas" : ["ssh"],
"mandatory_args" : ["project"],
- "ssh" :
- {
- "job_manager_endpoint" : "slurm+ssh://comet.sdsc.xsede.org/",
- "filesystem_endpoint" : "sftp://comet.sdsc.xsede.org/"
+ "default_schema" : "ssh",
+ "schemas" : {
+ "ssh" : {
+ "job_manager_endpoint": "slurm+ssh://comet.sdsc.xsede.org/",
+ "filesystem_endpoint" : "sftp://comet.sdsc.xsede.org/"
+ }
},
"default_queue" : "compute",
"lfs_path_per_node" : "/scratch/$USER/$SLURM_JOB_ID",
@@ -306,27 +302,25 @@
"bridges2": {
"description" : "The ACCESS 'Bridges2' cluster at PSC (https://www.psc.edu/resources/bridges-2/user-guide-2-2/).",
"notes" : "Always set the ``project`` attribute in the PilotDescription.",
- "schemas" : ["local", "interactive", "gsissh", "ssh"],
# "mandatory_args" : [],
- "local" :
- {
- "job_manager_endpoint" : "slurm://bridges2.psc.xsede.org/",
- "filesystem_endpoint" : "file://bridges2.psc.xsede.org/"
- },
- "interactive" :
- {
- "job_manager_endpoint" : "fork://localhost/",
- "filesystem_endpoint" : "file://localhost/"
- },
- "gsissh" :
- {
- "job_manager_endpoint" : "slurm+gsissh://bridges2.psc.xsede.org:2222/",
- "filesystem_endpoint" : "gsisftp://bridges2.psc.xsede.org:2222/"
- },
- "ssh" :
- {
- "job_manager_endpoint" : "slurm+ssh://bridges2.psc.xsede.org/",
- "filesystem_endpoint" : "sftp://bridges2.psc.xsede.org/"
+ "default_schema" : "local",
+ "schemas" : {
+ "local" : {
+ "job_manager_endpoint": "slurm://bridges2.psc.xsede.org/",
+ "filesystem_endpoint" : "file://bridges2.psc.xsede.org/"
+ },
+ "interactive" : {
+ "job_manager_endpoint": "fork://localhost/",
+ "filesystem_endpoint" : "file://localhost/"
+ },
+ "gsissh" : {
+ "job_manager_endpoint": "slurm+gsissh://bridges2.psc.xsede.org:2222/",
+ "filesystem_endpoint" : "gsisftp://bridges2.psc.xsede.org:2222/"
+ },
+ "ssh" : {
+ "job_manager_endpoint": "slurm+ssh://bridges2.psc.xsede.org/",
+ "filesystem_endpoint" : "sftp://bridges2.psc.xsede.org/"
+ }
},
"default_queue" : "RM",
"resource_manager" : "SLURM",
diff --git a/src/radical/pilot/configs/resource_anl.json b/src/radical/pilot/configs/resource_anl.json
index 41dfc795e3..1530254ce0 100644
--- a/src/radical/pilot/configs/resource_anl.json
+++ b/src/radical/pilot/configs/resource_anl.json
@@ -3,12 +3,13 @@
"theta": {
"description" : "Cray XC40, 4392 nodes (Intel KNL 7230)",
"notes" : "Local instance of MongoDB and pre-set VE should be used.",
- "schemas" : ["local"],
- "local" :
- {
- "job_manager_hop" : "cobalt://localhost/",
- "job_manager_endpoint" : "cobalt://localhost/",
- "filesystem_endpoint" : "file://localhost/"
+ "default_schema" : "local",
+ "schemas" : {
+ "local" : {
+ "job_manager_hop" : "cobalt://localhost/",
+ "job_manager_endpoint": "cobalt://localhost/",
+ "filesystem_endpoint" : "file://localhost/"
+ }
},
"default_queue" : "debug-flat-quad",
"resource_manager" : "COBALT",
@@ -33,12 +34,13 @@
"theta_gpu": {
"description" : "Extension of Theta, 24 NVIDIA DGX A100 nodes",
"notes" : "Local instance of MongoDB and pre-set VE should be used.",
- "schemas" : ["local"],
- "local" :
- {
- "job_manager_hop" : "cobalt://localhost/",
- "job_manager_endpoint" : "cobalt://localhost/",
- "filesystem_endpoint" : "file://localhost/"
+ "default_schema" : "local",
+ "schemas" : {
+ "local" : {
+ "job_manager_hop" : "cobalt://localhost/",
+ "job_manager_endpoint": "cobalt://localhost/",
+ "filesystem_endpoint" : "file://localhost/"
+ }
},
"default_queue" : "full-node",
"resource_manager" : "COBALT",
@@ -66,16 +68,16 @@
"polaris": {
"description" : "AMD EPYC Milan 7543P 32 core CPU with four Nvidia A100 GPUs, 560 nodes",
"notes" : "Local instance of MongoDB and pre-set VE should be used.",
- "schemas" : ["local", "interactive"],
- "local" :
- {
- "job_manager_endpoint" : "pbspro://localhost/",
- "filesystem_endpoint" : "file://localhost/"
- },
- "interactive" :
- {
- "job_manager_endpoint" : "fork://localhost/",
- "filesystem_endpoint" : "file://localhost/"
+ "default_schema" : "local",
+ "schemas" : {
+ "local" : {
+ "job_manager_endpoint": "pbspro://localhost/",
+ "filesystem_endpoint" : "file://localhost/"
+ },
+ "interactive" : {
+ "job_manager_endpoint": "fork://localhost/",
+ "filesystem_endpoint" : "file://localhost/"
+ }
},
"default_queue" : "debug-scaling",
"resource_manager" : "PBSPRO",
@@ -102,11 +104,13 @@
"arcticus": {
"description" : "JLSE Aurora testbed; 17x Coyote Pass nodes, 2x XeHP_SDV",
"notes" : "Duo two-factor login. Local instance of virtualenv should be used.",
- "schemas" : [ "local" ],
- "local" : {
- "job_manager_hop" : "cobalt://localhost/",
- "job_manager_endpoint" : "cobalt://localhost/",
- "filesystem_endpoint" : "file://localhost/"
+ "default_schema" : "local",
+ "schemas" : {
+ "local" : {
+ "job_manager_hop" : "cobalt://localhost/",
+ "job_manager_endpoint": "cobalt://localhost/",
+ "filesystem_endpoint" : "file://localhost/"
+ }
},
# "forward_tunnel_endpoint" : "jlselogin5",
"default_queue" : "full-node",
diff --git a/src/radical/pilot/configs/resource_csc.json b/src/radical/pilot/configs/resource_csc.json
index ec49551774..40af684570 100644
--- a/src/radical/pilot/configs/resource_csc.json
+++ b/src/radical/pilot/configs/resource_csc.json
@@ -1,28 +1,27 @@
{
- "mahti":
- {
+ "mahti": {
"description" : "1404 CPU nodes",
- "schemas" : ["local", "batch", "interactive"],
- "local" :
- {
- "job_manager_endpoint" : "slurm://mahti.csc.fi/",
- "filesystem_endpoint" : "file://mahti.csc.fi/"
- },
- "batch" : "interactive",
- "interactive" :
- {
- "job_manager_endpoint" : "fork://localhost/",
- "filesystem_endpoint" : "file://localhost/"
- },
+ "default_schema" : "local",
+ "schemas" : {
+ "local" : {
+ "job_manager_endpoint": "slurm://mahti.csc.fi/",
+ "filesystem_endpoint" : "file://mahti.csc.fi/"
+ },
+ "batch" : "interactive",
+ "interactive" : {
+ "job_manager_endpoint": "fork://localhost/",
+ "filesystem_endpoint" : "file://localhost/"
+ }
+ },
"default_queue" : "test",
"resource_manager" : "SLURM",
"cores_per_node" : 64,
"gpus_per_node" : 0,
-
- "agent_config" : "default",
+
+ "agent_config" : "default",
"agent_scheduler" : "CONTINUOUS",
"agent_spawner" : "POPEN",
"default_remote_workdir" : "/scratch/%(pd.project)s",
@@ -31,12 +30,10 @@
"module load tykky"
],
"launch_methods" : {
- "order" : ["SRUN"],
- "SRUN" : {
- }
+ "order" : ["SRUN"],
+ "SRUN" : {}
},
- "python_dist" : "default",
"virtenv_mode" : "local"
}
}
diff --git a/src/radical/pilot/configs/resource_debug.json b/src/radical/pilot/configs/resource_debug.json
index 126c9dd101..d305878879 100644
--- a/src/radical/pilot/configs/resource_debug.json
+++ b/src/radical/pilot/configs/resource_debug.json
@@ -1,19 +1,21 @@
{
- "local": {
+ "summit": {
"description" : "",
"notes" : "",
- "schemas" : ["local"],
- "local" : {
- "job_manager_hop" : "fork://localhost/",
- "job_manager_endpoint" : "fork://localhost/",
- "filesystem_endpoint" : "file://localhost/"
+ "default_schema" : "local",
+ "schemas" : {
+ "local" : {
+ "job_manager_hop" : "fork://localhost/",
+ "job_manager_endpoint": "fork://localhost/",
+ "filesystem_endpoint" : "file://localhost/"
+ }
},
"default_queue" : "",
"resource_manager" : "FORK",
"agent_config" : "default_sa",
"agent_scheduler" : "CONTINUOUS",
- "agent_spawner" : "POPEN",
+ "agent_spawner" : "SLEEP",
"launch_methods" : {
"order" : ["FORK", "MPIRUN"],
"FORK" : {},
@@ -35,16 +37,16 @@
"test": {
"description" : "Your local machine.",
"notes" : "",
- "schemas" : ["local", "ssh"],
- "ssh" :
- {
- "job_manager_endpoint" : "ssh://localhost/",
- "filesystem_endpoint" : "sftp://localhost/"
- },
- "local" :
- {
- "job_manager_endpoint" : "fork://localhost/",
- "filesystem_endpoint" : "file://localhost/"
+ "default_schema" : "local",
+ "schemas" : {
+ "ssh" : {
+ "job_manager_endpoint": "ssh://localhost/",
+ "filesystem_endpoint" : "sftp://localhost/"
+ },
+ "local" : {
+ "job_manager_endpoint": "fork://localhost/",
+ "filesystem_endpoint" : "file://localhost/"
+ }
},
"pre_bootstrap_1" : [
"export RP_APP_TUNNEL_ADDR=144.76.72.175:27017",
@@ -75,16 +77,16 @@
"flux": {
"description" : "",
"notes" : "",
- "schemas" : ["local", "ssh"],
- "ssh" :
- {
- "job_manager_endpoint" : "ssh://localhost/",
- "filesystem_endpoint" : "sftp://localhost/"
- },
- "local" :
- {
- "job_manager_endpoint" : "fork://localhost/",
- "filesystem_endpoint" : "file://localhost/"
+ "default_schema" : "local",
+ "schemas" : {
+ "ssh" : {
+ "job_manager_endpoint": "ssh://localhost/",
+ "filesystem_endpoint" : "sftp://localhost/"
+ },
+ "local" : {
+ "job_manager_endpoint": "fork://localhost/",
+ "filesystem_endpoint" : "file://localhost/"
+ }
},
"default_remote_workdir" : "$HOME",
"resource_manager" : "FORK",
diff --git a/src/radical/pilot/configs/resource_llnl.json b/src/radical/pilot/configs/resource_llnl.json
index d46eaf5fc4..841c0792cc 100644
--- a/src/radical/pilot/configs/resource_llnl.json
+++ b/src/radical/pilot/configs/resource_llnl.json
@@ -2,12 +2,13 @@
"lassen": {
"description" : "Unclassified Sierra system (arch: IBM Power9, NVIDIA TeslaV100)",
"notes" : "A dedicated local instance of MongoDB should be used",
- "schemas" : ["local"],
- "local" :
- {
- "job_manager_hop" : "fork://localhost/",
- "job_manager_endpoint" : "lsf://localhost/",
- "filesystem_endpoint" : "file://localhost/"
+ "default_schema" : "local",
+ "schemas" : {
+ "local" : {
+ "job_manager_hop" : "fork://localhost/",
+ "job_manager_endpoint": "lsf://localhost/",
+ "filesystem_endpoint" : "file://localhost/"
+ }
},
# "forward_tunnel_endpoint" : "`hostname -f`",
"default_queue" : "pbatch",
diff --git a/src/radical/pilot/configs/resource_local.json b/src/radical/pilot/configs/resource_local.json
index 489cd0166c..159ef86825 100644
--- a/src/radical/pilot/configs/resource_local.json
+++ b/src/radical/pilot/configs/resource_local.json
@@ -3,16 +3,16 @@
"localhost": {
"description" : "Your local machine.",
"notes" : "To use the ssh schema, make sure that ssh access to localhost is enabled.",
- "schemas" : ["local", "ssh"],
- "ssh" :
- {
- "job_manager_endpoint" : "ssh://localhost/",
- "filesystem_endpoint" : "sftp://localhost/"
- },
- "local" :
- {
- "job_manager_endpoint" : "fork://localhost/",
- "filesystem_endpoint" : "file://localhost/"
+ "default_schema" : "local",
+ "schemas" : {
+ "ssh" : {
+ "job_manager_endpoint": "ssh://localhost",
+ "filesystem_endpoint" : "sftp://localhost"
+ },
+ "local" : {
+ "job_manager_endpoint": "fork://localhost",
+ "filesystem_endpoint" : "file://localhost"
+ }
},
"default_remote_workdir" : "$HOME",
"resource_manager" : "FORK",
@@ -41,11 +41,12 @@
"localhost_test": {
"description" : "Your local machine.",
"notes" : "To use the ssh schema, make sure that ssh access to localhost is enabled.",
- "schemas" : ["local"],
- "local" :
- {
- "job_manager_endpoint" : "fork://localhost/",
- "filesystem_endpoint" : "file://localhost/"
+ "default_schema" : "local",
+ "schemas" : {
+ "local" : {
+ "job_manager_endpoint": "fork://localhost/",
+ "filesystem_endpoint" : "file://localhost/"
+ }
},
"default_remote_workdir" : "$HOME",
"resource_manager" : "FORK",
@@ -71,16 +72,16 @@
"localhost_anaconda": {
"description" : "Your local machine.",
"notes" : "To use the ssh schema, make sure that ssh access to localhost is enabled.",
- "schemas" : ["local", "ssh"],
- "ssh" :
- {
- "job_manager_endpoint" : "ssh://localhost/",
- "filesystem_endpoint" : "sftp://localhost/"
- },
- "local" :
- {
- "job_manager_endpoint" : "fork://localhost/",
- "filesystem_endpoint" : "file://localhost/"
+ "default_schema" : "local",
+ "schemas" : {
+ "ssh" : {
+ "job_manager_endpoint": "ssh://localhost/",
+ "filesystem_endpoint" : "sftp://localhost/"
+ },
+ "local" : {
+ "job_manager_endpoint": "fork://localhost/",
+ "filesystem_endpoint" : "file://localhost/"
+ }
},
"default_remote_workdir" : "$HOME",
"resource_manager" : "FORK",
diff --git a/src/radical/pilot/configs/resource_ncar.json b/src/radical/pilot/configs/resource_ncar.json
index ee8f0a2b15..e0649544e5 100644
--- a/src/radical/pilot/configs/resource_ncar.json
+++ b/src/radical/pilot/configs/resource_ncar.json
@@ -3,17 +3,18 @@
"cheyenne": {
"description" : "An SGI ICE XA Cluster located at the National Center for Atmospheric Research (NCAR), (https://www2.cisl.ucar.edu/resources/computational-systems/cheyenne)",
"notes" : "Requires the use of a token from an USB on every connection.",
- "schemas" : ["local", "ssh"],
- "local" : {
- "job_manager_hop" : "fork://localhost/",
- "job_manager_endpoint" : "pbspro://localhost/",
- "filesystem_endpoint" : "file://localhost/"
+ "default_schema" : "local",
+ "schemas" : {
+ "local" : {
+ "job_manager_hop" : "fork://localhost/",
+ "job_manager_endpoint": "pbspro://localhost/",
+ "filesystem_endpoint" : "file://localhost/"
+ },
+ "ssh" : {
+ "job_manager_endpoint": "ssh+pbspro://cheyenne.ucar.edu/",
+ "filesystem_endpoint" : "file://cheyenne.ucar.edu/"
+ }
},
- "ssh" : {
- "job_manager_endpoint" : "ssh+pbspro://cheyenne.ucar.edu/",
- "filesystem_endpoint" : "file://cheyenne.ucar.edu/"
- },
-
"default_queue" : "regular",
"resource_manager" : "PBSPRO",
"cores_per_node" : 36,
@@ -50,15 +51,17 @@
"cheyenne_mpt": {
"description" : "An SGI ICE XA Cluster located at the National Center for Atmospheric Research (NCAR), (https://www2.cisl.ucar.edu/resources/computational-systems/cheyenne)",
"notes" : "Requires the use of a token from an USB on every connection.",
- "schemas" : ["local", "ssh"],
- "local" : {
- "job_manager_hop" : "fork://localhost/",
- "job_manager_endpoint" : "pbspro://localhost/",
- "filesystem_endpoint" : "file://localhost/"
- },
- "ssh" : {
- "job_manager_endpoint" : "ssh+pbspro://cheyenne.ucar.edu/",
- "filesystem_endpoint" : "file://cheyenne.ucar.edu/"
+ "default_schema" : "local",
+ "schemas" : {
+ "local" : {
+ "job_manager_hop" : "fork://localhost/",
+ "job_manager_endpoint": "pbspro://localhost/",
+ "filesystem_endpoint" : "file://localhost/"
+ },
+ "ssh" : {
+ "job_manager_endpoint": "ssh+pbspro://cheyenne.ucar.edu/",
+ "filesystem_endpoint" : "file://cheyenne.ucar.edu/"
+ }
},
"default_queue" : "regular",
"resource_manager" : "PBSPRO",
diff --git a/src/radical/pilot/configs/resource_ncsa.json b/src/radical/pilot/configs/resource_ncsa.json
index 2948a402cf..6e2121d364 100644
--- a/src/radical/pilot/configs/resource_ncsa.json
+++ b/src/radical/pilot/configs/resource_ncsa.json
@@ -2,18 +2,21 @@
"delta":
{
"description" : "132 standard CPU (AMD EPYC 7763) nodes",
- "schemas" : ["local", "batch", "interactive"],
- "local" :
- {
- "job_manager_endpoint" : "slurm://delta.ncsa.illinois.edu/",
- "filesystem_endpoint" : "file://delta.ncsa.illinois.edu/"
+ "default_schema" : "local",
+ "schemas" : {
+ "local" : {
+ "job_manager_endpoint": "slurm://delta.ncsa.illinois.edu/",
+ "filesystem_endpoint" : "file://delta.ncsa.illinois.edu/"
+ },
+ "batch" : {
+ "job_manager_endpoint": "fork://localhost/",
+ "filesystem_endpoint" : "file://localhost/"
+ },
+ "interactive" : {
+ "job_manager_endpoint": "fork://localhost/",
+ "filesystem_endpoint" : "file://localhost/"
+ }
},
- "batch" :
- {
- "job_manager_endpoint" : "fork://localhost/",
- "filesystem_endpoint" : "file://localhost/"
- },
- "interactive" : "batch",
"default_queue" : "cpu",
"cores_per_node" : 128,
"resource_manager" : "SLURM",
@@ -34,18 +37,21 @@
"delta_gpu_a40":
{
"description" : "100 4-way A40-based GPU nodes",
- "schemas" : ["local", "batch", "interactive"],
- "local" :
- {
- "job_manager_endpoint" : "slurm://delta.ncsa.illinois.edu/",
- "filesystem_endpoint" : "file://delta.ncsa.illinois.edu/"
- },
- "batch" :
- {
- "job_manager_endpoint" : "fork://localhost/",
- "filesystem_endpoint" : "file://localhost/"
+ "default_schema" : "local",
+ "schemas" : {
+ "local" : {
+ "job_manager_endpoint": "slurm://delta.ncsa.illinois.edu/",
+ "filesystem_endpoint" : "file://delta.ncsa.illinois.edu/"
+ },
+ "batch" : {
+ "job_manager_endpoint": "fork://localhost/",
+ "filesystem_endpoint" : "file://localhost/"
+ },
+ "interactive" : {
+ "job_manager_endpoint": "fork://localhost/",
+ "filesystem_endpoint" : "file://localhost/"
+ }
},
- "interactive" : "batch",
"default_queue" : "gpuA40x4",
"cores_per_node" : 64,
"gpus_per_node" : 4,
@@ -67,18 +73,21 @@
"delta_gpu_a100_4way":
{
"description" : "100 4-way A100-based GPU nodes",
- "schemas" : ["local", "batch", "interactive"],
- "local" :
- {
- "job_manager_endpoint" : "slurm://delta.ncsa.illinois.edu/",
- "filesystem_endpoint" : "file://delta.ncsa.illinois.edu/"
- },
- "batch" :
- {
- "job_manager_endpoint" : "fork://localhost/",
- "filesystem_endpoint" : "file://localhost/"
+ "default_schema" : "local",
+ "schemas" : {
+ "local" : {
+ "job_manager_endpoint": "slurm://delta.ncsa.illinois.edu/",
+ "filesystem_endpoint" : "file://delta.ncsa.illinois.edu/"
+ },
+ "batch" : {
+ "job_manager_endpoint": "fork://localhost/",
+ "filesystem_endpoint" : "file://localhost/"
+ },
+ "interactive" : {
+ "job_manager_endpoint": "fork://localhost/",
+ "filesystem_endpoint" : "file://localhost/"
+ }
},
- "interactive" : "batch",
"default_queue" : "gpuA100x4",
"cores_per_node" : 64,
"gpus_per_node" : 4,
@@ -100,18 +109,21 @@
"delta_gpu_a100_8way":
{
"description" : "6 8-way A100-based GPU nodes",
- "schemas" : ["local", "batch", "interactive"],
- "local" :
- {
- "job_manager_endpoint" : "slurm://delta.ncsa.illinois.edu/",
- "filesystem_endpoint" : "file://delta.ncsa.illinois.edu/"
+ "default_schema" : "local",
+ "schemas" : {
+ "local" : {
+ "job_manager_endpoint": "slurm://delta.ncsa.illinois.edu/",
+ "filesystem_endpoint" : "file://delta.ncsa.illinois.edu/"
+ },
+ "batch" : {
+ "job_manager_endpoint": "fork://localhost/",
+ "filesystem_endpoint" : "file://localhost/"
+ },
+ "interactive" : {
+ "job_manager_endpoint": "fork://localhost/",
+ "filesystem_endpoint" : "file://localhost/"
+ }
},
- "batch" :
- {
- "job_manager_endpoint" : "fork://localhost/",
- "filesystem_endpoint" : "file://localhost/"
- },
- "interactive" : "batch",
"default_queue" : "gpuA100x8",
"cores_per_node" : 128,
"gpus_per_node" : 8,
@@ -133,18 +145,21 @@
"delta_gpu_mi100":
{
"description" : "1 8-way MI100-based GPU node",
- "schemas" : ["local", "batch", "interactive"],
- "local" :
- {
- "job_manager_endpoint" : "slurm://delta.ncsa.illinois.edu/",
- "filesystem_endpoint" : "file://delta.ncsa.illinois.edu/"
- },
- "batch" :
- {
- "job_manager_endpoint" : "fork://localhost/",
- "filesystem_endpoint" : "file://localhost/"
+ "default_schema" : "local",
+ "schemas" : {
+ "local" : {
+ "job_manager_endpoint": "slurm://delta.ncsa.illinois.edu/",
+ "filesystem_endpoint" : "file://delta.ncsa.illinois.edu/"
+ },
+ "batch" : {
+ "job_manager_endpoint": "fork://localhost/",
+ "filesystem_endpoint" : "file://localhost/"
+ },
+ "interactive" : {
+ "job_manager_endpoint": "fork://localhost/",
+ "filesystem_endpoint" : "file://localhost/"
+ }
},
- "interactive" : "batch",
"default_queue" : "gpuMI100x8",
"cores_per_node" : 128,
"gpus_per_node" : 8,
diff --git a/src/radical/pilot/configs/resource_nersc.json b/src/radical/pilot/configs/resource_nersc.json
index 757b68808c..fa7365bfb1 100644
--- a/src/radical/pilot/configs/resource_nersc.json
+++ b/src/radical/pilot/configs/resource_nersc.json
@@ -2,17 +2,22 @@
{
"perlmutter": {
"description" : "CPU nodes: 3072",
- "schemas" : ["local", "interactive", "batch"],
- "local" : {
- "job_manager_endpoint" : "slurm://perlmutter-p1.nersc.gov/",
- "filesystem_endpoint" : "file://perlmutter-p1.nersc.gov/"
+ "default_schema" : "local",
+ "schemas" : {
+ "local" : {
+ "job_manager_endpoint": "slurm://perlmutter-p1.nersc.gov/",
+ "filesystem_endpoint" : "file://perlmutter-p1.nersc.gov/"
+ },
+ "batch" :
+ {
+ "job_manager_endpoint": "fork://localhost/",
+ "filesystem_endpoint" : "file://localhost/"
+ },
+ "interactive" : {
+ "job_manager_endpoint": "fork://localhost/",
+ "filesystem_endpoint" : "file://localhost/"
+ }
},
- "batch" :
- {
- "job_manager_endpoint" : "fork://localhost/",
- "filesystem_endpoint" : "file://localhost/"
- },
- "interactive" : "batch",
"default_queue" : "regular",
"resource_manager" : "SLURM",
"agent_scheduler" : "CONTINUOUS",
@@ -36,17 +41,21 @@
"perlmutter_gpu": {
"description" : "GPU nodes: 1536 with 40GiB and 256 with 80GiB of GPU-attached memory",
- "schemas" : ["local", "interactive", "batch"],
- "local" : {
- "job_manager_endpoint" : "slurm://perlmutter-p1.nersc.gov/",
- "filesystem_endpoint" : "file://perlmutter-p1.nersc.gov/"
- },
- "batch" :
- {
- "job_manager_endpoint" : "fork://localhost/",
- "filesystem_endpoint" : "file://localhost/"
+ "default_schema" : "local",
+ "schemas" : {
+ "local" : {
+ "job_manager_endpoint": "slurm://perlmutter-p1.nersc.gov/",
+ "filesystem_endpoint" : "file://perlmutter-p1.nersc.gov/"
+ },
+ "batch" : {
+ "job_manager_endpoint": "fork://localhost/",
+ "filesystem_endpoint" : "file://localhost/"
+ },
+ "interactive" : {
+ "job_manager_endpoint": "fork://localhost/",
+ "filesystem_endpoint" : "file://localhost/"
+ }
},
- "interactive" : "batch",
"default_queue" : "regular",
"resource_manager" : "SLURM",
"agent_scheduler" : "CONTINUOUS",
diff --git a/src/radical/pilot/configs/resource_ornl.json b/src/radical/pilot/configs/resource_ornl.json
index 42e7e3ffc8..af2afe82d7 100644
--- a/src/radical/pilot/configs/resource_ornl.json
+++ b/src/radical/pilot/configs/resource_ornl.json
@@ -3,10 +3,12 @@
"andes": {
"description" : "704 compute nodes",
"notes" : "Requires RSA SecurID and uses local virtual env",
- "schemas" : ["local"],
- "local" : {
- "job_manager_endpoint" : "slurm://andes.olcf.ornl.gov/",
- "filesystem_endpoint" : "file://andes.olcf.ornl.gov/"
+ "default_schema" : "local",
+ "schemas" : {
+ "local" : {
+ "job_manager_endpoint": "slurm://andes.olcf.ornl.gov/",
+ "filesystem_endpoint" : "file://andes.olcf.ornl.gov/"
+ }
},
"forward_tunnel_endpoint" : "andes.olcf.ornl.gov",
"default_queue" : "batch",
@@ -28,10 +30,12 @@
"andes_gpu": {
"description" : "9 gpu nodes",
"notes" : "Requires RSA SecurID and uses local virtual env",
- "schemas" : ["local"],
- "local" : {
- "job_manager_endpoint" : "slurm://andes.olcf.ornl.gov/",
- "filesystem_endpoint" : "file://andes.olcf.ornl.gov/"
+ "default_schema" : "local",
+ "schemas" : {
+ "local" : {
+ "job_manager_endpoint": "slurm://andes.olcf.ornl.gov/",
+ "filesystem_endpoint" : "file://andes.olcf.ornl.gov/"
+ }
},
"forward_tunnel_endpoint" : "andes.olcf.ornl.gov",
"default_queue" : "gpu",
@@ -55,10 +59,12 @@
"crusher": {
"description" : "2 cabinets: (1) 128 compute nodes; (2) 64 compute nodes",
"notes" : "Requires RSA SecurID and uses prepared local virtual env",
- "schemas" : ["local"],
- "local" : {
- "job_manager_endpoint" : "slurm://crusher.olcf.ornl.gov/",
- "filesystem_endpoint" : "file://crusher.olcf.ornl.gov/"
+ "default_schema" : "local",
+ "schemas" : {
+ "local" : {
+ "job_manager_endpoint": "slurm://crusher.olcf.ornl.gov/",
+ "filesystem_endpoint" : "file://crusher.olcf.ornl.gov/"
+ }
},
"default_queue" : "batch",
"resource_manager" : "SLURM",
@@ -91,10 +97,12 @@
"frontier": {
"description" : "74 cabinets: 128 compute nodes",
"notes" : "Requires RSA SecurID and uses prepared local virtual env",
- "schemas" : ["local"],
- "local" : {
- "job_manager_endpoint" : "slurm://frontier.olcf.ornl.gov/",
- "filesystem_endpoint" : "file://frontier.olcf.ornl.gov/"
+ "default_schema" : "local",
+ "schemas" : {
+ "local" : {
+ "job_manager_endpoint": "slurm://frontier.olcf.ornl.gov/",
+ "filesystem_endpoint" : "file://frontier.olcf.ornl.gov/"
+ }
},
"default_queue" : "batch",
"resource_manager" : "SLURM",
@@ -123,10 +131,12 @@
"spock": {
"description" : "3 cabinets: each containing 12 compute nodes",
"notes" : "",
- "schemas" : ["local"],
- "local" : {
- "job_manager_endpoint" : "slurm://spock.olcf.ornl.gov/",
- "filesystem_endpoint" : "file://spock.olcf.ornl.gov/"
+ "default_schema" : "local",
+ "schemas" : {
+ "local" : {
+ "job_manager_endpoint": "slurm://spock.olcf.ornl.gov/",
+ "filesystem_endpoint" : "file://spock.olcf.ornl.gov/"
+ }
},
"default_queue" : "ecp",
"resource_manager" : "SLURM",
@@ -159,11 +169,13 @@
"summit": {
"description" : "4608 nodes with 2 IBM POWER9 CPUs and 6 NVIDIA Volta V100 GPUs",
"notes" : "Requires RSA SecurID and uses local virtual env",
- "schemas" : ["local"],
- "local" : {
- "job_manager_hop" : "fork://localhost/",
- "job_manager_endpoint" : "lsf://localhost/",
- "filesystem_endpoint" : "file://localhost/"
+ "default_schema" : "local",
+ "schemas" : {
+ "local" : {
+ "job_manager_hop" : "fork://localhost/",
+ "job_manager_endpoint": "lsf://localhost/",
+ "filesystem_endpoint" : "file://localhost/"
+ }
},
"default_queue" : "batch",
"resource_manager" : "LSF",
@@ -196,17 +208,19 @@
"blocked_cores" : [],
"blocked_gpus" : []
},
- "task_pre_exec" : ["export LD_LIBRARY_PATH=/sw/summit/spack-envs/base/opt/linux-rhel8-ppc64le/gcc-9.1.0/spectrum-mpi-10.4.0.3-20210112-6jbupg3thjwhsabgevk6xmwhd2bbyxdc/container/../lib/pami_port:${LD_LIBRARY_PATH}"]
+ "task_pre_exec" : ["export LD_LIBRARY_PATH=/sw/summit/spack-envs/base/opt/linux-rhel8-ppc64le/gcc-9.1.0/spectrum-mpi-10.4.0.3-20210112-6jbupg3thjwhsabgevk6xmwhd2bbyxdc/container/../lib/pami_port:$LD_LIBRARY_PATH"]
},
"summit_jsrun": {
"description" : "4608 nodes with 2 IBM POWER9 CPUs and 6 NVIDIA Volta V100 GPUs",
"notes" : "Requires RSA SecurID and uses local virtual env",
- "schemas" : ["local"],
- "local" : {
- "job_manager_hop" : "fork://localhost/",
- "job_manager_endpoint" : "lsf://localhost/",
- "filesystem_endpoint" : "file://localhost/"
+ "default_schema" : "local",
+ "schemas" : {
+ "local" : {
+ "job_manager_hop" : "fork://localhost/",
+ "job_manager_endpoint": "lsf://localhost/",
+ "filesystem_endpoint" : "file://localhost/"
+ }
},
"default_queue" : "batch",
"resource_manager" : "LSF",
@@ -243,16 +257,18 @@
"blocked_cores" : [],
"blocked_gpus" : []
},
- "task_pre_exec" : ["export LD_LIBRARY_PATH=/sw/summit/spack-envs/base/opt/linux-rhel8-ppc64le/gcc-9.1.0/spectrum-mpi-10.4.0.3-20210112-6jbupg3thjwhsabgevk6xmwhd2bbyxdc/container/../lib/pami_port:${LD_LIBRARY_PATH}"]
+ "task_pre_exec" : ["export LD_LIBRARY_PATH=/sw/summit/spack-envs/base/opt/linux-rhel8-ppc64le/gcc-9.1.0/spectrum-mpi-10.4.0.3-20210112-6jbupg3thjwhsabgevk6xmwhd2bbyxdc/container/../lib/pami_port:$LD_LIBRARY_PATH"]
},
"summit_interactive": {
"description" : "4608 nodes with 2 IBM POWER9 CPUs and 6 NVIDIA Volta V100 GPUs",
"notes" : "interactive job https://docs.olcf.ornl.gov/systems/summit_user_guide.html",
- "schemas" : ["interactive"],
- "interactive" : {
- "job_manager_endpoint" : "fork://localhost/",
- "filesystem_endpoint" : "file://localhost/"
+ "default_schema" : "interactive",
+ "schemas" : {
+ "interactive" : {
+ "job_manager_endpoint": "fork://localhost/",
+ "filesystem_endpoint" : "file://localhost/"
+ }
},
"resource_manager" : "LSF",
"agent_config" : "default",
@@ -290,11 +306,13 @@
"summit_prte": {
"description" : "4608 nodes with 2 IBM POWER9 CPUs and 6 NVIDIA Volta V100 GPUs",
"notes" : "Requires RSA SecurID and uses local virtual env",
- "schemas" : ["local"],
- "local" : {
- "job_manager_hop" : "fork://localhost/",
- "job_manager_endpoint" : "lsf://localhost/",
- "filesystem_endpoint" : "file://localhost/"
+ "default_schema" : "local",
+ "schemas" : {
+ "local" : {
+ "job_manager_hop" : "fork://localhost/",
+ "job_manager_endpoint": "lsf://localhost/",
+ "filesystem_endpoint" : "file://localhost/"
+ }
},
"default_queue" : "batch",
"resource_manager" : "LSF",
diff --git a/src/radical/pilot/configs/resource_princeton.json b/src/radical/pilot/configs/resource_princeton.json
index 2af22c45b5..a984227f22 100644
--- a/src/radical/pilot/configs/resource_princeton.json
+++ b/src/radical/pilot/configs/resource_princeton.json
@@ -3,13 +3,14 @@
"traverse": {
"description" : "",
"notes" : "",
- "schemas" : ["local"],
"mandatory_args" : [],
- "local" :
- {
- "job_manager_endpoint" : "slurm://traverse.princeton.edu/",
- "job_manager_hop" : "fork://localhost/",
- "filesystem_endpoint" : "file://localhost/"
+ "default_schema" : "local",
+ "schemas" : {
+ "local" : {
+ "job_manager_endpoint": "slurm://traverse.princeton.edu/",
+ "job_manager_hop" : "fork://localhost/",
+ "filesystem_endpoint" : "file://localhost/"
+ }
},
"default_queue" : "test",
"resource_manager" : "SLURM",
@@ -40,13 +41,14 @@
"traverse_mpirun": {
"description" : "",
"notes" : "",
- "schemas" : ["local"],
"mandatory_args" : [],
- "local" :
- {
- "job_manager_endpoint" : "slurm://traverse.princeton.edu/",
- "job_manager_hop" : "fork://localhost/",
- "filesystem_endpoint" : "file://localhost/"
+ "default_schema" : "local",
+ "schemas" : {
+ "local" : {
+ "job_manager_endpoint": "slurm://traverse.princeton.edu/",
+ "job_manager_hop" : "fork://localhost/",
+ "filesystem_endpoint" : "file://localhost/"
+ }
},
"default_queue" : "test",
"resource_manager" : "SLURM",
@@ -75,18 +77,18 @@
"tiger_cpu": {
"description" : "",
"notes" : "",
- "schemas" : ["local", "ssh"],
"mandatory_args" : [],
- "local" :
- {
- "job_manager_endpoint" : "slurm://tigercpu.princeton.edu/",
- "job_manager_hop" : "fork://localhost/",
- "filesystem_endpoint" : "file://localhost/"
- },
- "ssh" :
- {
- "job_manager_endpoint" : "slurm+ssh://tigercpu.princeton.edu/",
- "filesystem_endpoint" : "sftp://tigercpu.princeton.edu/"
+ "default_schema" : "local",
+ "schemas" : {
+ "local" : {
+ "job_manager_endpoint": "slurm://tigercpu.princeton.edu/",
+ "job_manager_hop" : "fork://localhost/",
+ "filesystem_endpoint" : "file://localhost/"
+ },
+ "ssh" : {
+ "job_manager_endpoint": "slurm+ssh://tigercpu.princeton.edu/",
+ "filesystem_endpoint" : "sftp://tigercpu.princeton.edu/"
+ }
},
"default_queue" : "cpu",
"resource_manager" : "SLURM",
@@ -123,18 +125,18 @@
"tiger_gpu": {
"description" : "",
"notes" : "",
- "schemas" : ["local", "ssh"],
"mandatory_args" : [],
- "local" :
- {
- "job_manager_endpoint" : "slurm://tigercpu.princeton.edu/",
- "job_manager_hop" : "fork://localhost/",
- "filesystem_endpoint" : "file://localhost/"
- },
- "ssh" :
- {
- "job_manager_endpoint" : "slurm+ssh://tigergpu.princeton.edu/",
- "filesystem_endpoint" : "sftp://tigergpu.princeton.edu/"
+ "default_schema" : "local",
+ "schemas" : {
+ "local" : {
+ "job_manager_endpoint": "slurm://tigercpu.princeton.edu/",
+ "job_manager_hop" : "fork://localhost/",
+ "filesystem_endpoint" : "file://localhost/"
+ },
+ "ssh" : {
+ "job_manager_endpoint": "slurm+ssh://tigergpu.princeton.edu/",
+ "filesystem_endpoint" : "sftp://tigergpu.princeton.edu/"
+ }
},
"default_queue" : "gpu",
"resource_manager" : "SLURM",
diff --git a/src/radical/pilot/configs/resource_rutgers.json b/src/radical/pilot/configs/resource_rutgers.json
index 86d7f85730..299a4bcbd3 100644
--- a/src/radical/pilot/configs/resource_rutgers.json
+++ b/src/radical/pilot/configs/resource_rutgers.json
@@ -3,21 +3,20 @@
{
"description" : "Heterogeneous community-model Linux cluster",
"notes" : "Access from registered IP address",
- "schemas" : ["local", "ssh", "interactive"],
- "local" :
- {
- "job_manager_endpoint" : "slurm://amarel.rutgers.edu/",
- "filesystem_endpoint" : "file://amarel.rutgers.edu/"
- },
- "ssh" :
- {
- "job_manager_endpoint" : "slurm+ssh://amarel.rutgers.edu/",
- "filesystem_endpoint" : "sftp://amarel.rutgers.edu/"
- },
- "interactive" :
- {
- "job_manager_endpoint" : "fork://localhost/",
- "filesystem_endpoint" : "file://localhost/"
+ "default_schema" : "local",
+ "schemas" : {
+ "local" : {
+ "job_manager_endpoint": "slurm://amarel.rutgers.edu/",
+ "filesystem_endpoint" : "file://amarel.rutgers.edu/"
+ },
+ "ssh" : {
+ "job_manager_endpoint": "slurm+ssh://amarel.rutgers.edu/",
+ "filesystem_endpoint" : "sftp://amarel.rutgers.edu/"
+ },
+ "interactive" : {
+ "job_manager_endpoint": "fork://localhost/",
+ "filesystem_endpoint" : "file://localhost/"
+ }
},
"default_queue" : "main",
"resource_manager" : "SLURM",
diff --git a/src/radical/pilot/configs/resource_tacc.json b/src/radical/pilot/configs/resource_tacc.json
index 734081b445..398c6beef4 100644
--- a/src/radical/pilot/configs/resource_tacc.json
+++ b/src/radical/pilot/configs/resource_tacc.json
@@ -4,22 +4,21 @@
"frontera": {
"description" : "Petascale computing system at the Texas Advanced Computing Center (TACC)",
"notes" : "Always launch RP from a login node or within a compute (interactive) node if you do not have a waiver from TACC for an external IP address",
- "schemas" : ["local", "ssh", "interactive"],
"mandatory_args" : ["project"],
- "local" :
- {
- "job_manager_endpoint" : "slurm://frontera.tacc.utexas.edu/",
- "filesystem_endpoint" : "file://frontera.tacc.utexas.edu/"
- },
- "ssh" :
- {
- "job_manager_endpoint" : "slurm+ssh://frontera.tacc.utexas.edu/",
- "filesystem_endpoint" : "sftp://frontera.tacc.utexas.edu/"
- },
- "interactive" :
- {
- "job_manager_endpoint" : "fork://localhost/",
- "filesystem_endpoint" : "file://localhost/"
+ "default_schema" : "local",
+ "schemas" : {
+ "local" : {
+ "job_manager_endpoint": "slurm://frontera.tacc.utexas.edu/",
+ "filesystem_endpoint" : "file://frontera.tacc.utexas.edu/"
+ },
+ "ssh" : {
+ "job_manager_endpoint": "slurm+ssh://frontera.tacc.utexas.edu/",
+ "filesystem_endpoint" : "sftp://frontera.tacc.utexas.edu/"
+ },
+ "interactive" : {
+ "job_manager_endpoint": "fork://localhost/",
+ "filesystem_endpoint" : "file://localhost/"
+ }
},
"default_queue" : "normal",
"resource_manager" : "SLURM",
@@ -57,17 +56,17 @@
"frontera_rtx": {
"description" : "Petascale computing system at the Texas Advanced Computing Center (TACC)",
"notes" : "Always launch RP from a login node if you do not have a waiver from TACC for an external IP address",
- "schemas" : ["local", "ssh"],
"mandatory_args" : ["project"],
- "ssh" :
- {
- "job_manager_endpoint" : "slurm+ssh://frontera.tacc.utexas.edu/",
- "filesystem_endpoint" : "sftp://frontera.tacc.utexas.edu/"
- },
- "local" :
- {
- "job_manager_endpoint" : "slurm://frontera.tacc.utexas.edu/",
- "filesystem_endpoint" : "file://frontera.tacc.utexas.edu/"
+ "default_schema" : "local",
+ "schemas" : {
+ "ssh" : {
+ "job_manager_endpoint": "slurm+ssh://frontera.tacc.utexas.edu/",
+ "filesystem_endpoint" : "sftp://frontera.tacc.utexas.edu/"
+ },
+ "local" : {
+ "job_manager_endpoint": "slurm://frontera.tacc.utexas.edu/",
+ "filesystem_endpoint" : "file://frontera.tacc.utexas.edu/"
+ }
},
"default_queue" : "rtx",
"resource_manager" : "SLURM",
@@ -100,17 +99,17 @@
"frontera_prte": {
"description" : "Petascale computing system at the Texas Advanced Computing Center (TACC)",
"notes" : "Always launch RP from a login node if you do not have a waiver from TACC for an external IP address",
- "schemas" : ["local", "ssh"],
"mandatory_args" : ["project"],
- "ssh" :
- {
- "job_manager_endpoint" : "slurm+ssh://frontera.tacc.utexas.edu/",
- "filesystem_endpoint" : "sftp://frontera.tacc.utexas.edu/"
- },
- "local" :
- {
- "job_manager_endpoint" : "slurm://frontera.tacc.utexas.edu/",
- "filesystem_endpoint" : "file://frontera.tacc.utexas.edu/"
+ "default_schema" : "local",
+ "schemas" : {
+ "ssh" : {
+ "job_manager_endpoint": "slurm+ssh://frontera.tacc.utexas.edu/",
+ "filesystem_endpoint" : "sftp://frontera.tacc.utexas.edu/"
+ },
+ "local" : {
+ "job_manager_endpoint": "slurm://frontera.tacc.utexas.edu/",
+ "filesystem_endpoint" : "file://frontera.tacc.utexas.edu/"
+ }
},
"default_queue" : "normal",
"resource_manager" : "SLURM",
diff --git a/src/radical/pilot/configs/resource_uva.json b/src/radical/pilot/configs/resource_uva.json
index acd760e322..adacac899d 100644
--- a/src/radical/pilot/configs/resource_uva.json
+++ b/src/radical/pilot/configs/resource_uva.json
@@ -3,21 +3,20 @@
{
"description" : "Heterogeneous community-model Linux cluster",
"notes" : "Access from registered UVA IP address. See https://www.rc.virginia.edu/userinfo/rivanna/login/",
- "schemas" : ["local", "ssh", "interactive"],
- "local" :
- {
- "job_manager_endpoint" : "slurm://rivanna.hpc.virginia.edu/",
- "filesystem_endpoint" : "file://rivanna.hpc.virginia.edu/"
- },
- "ssh" :
- {
- "job_manager_endpoint" : "slurm+ssh://rivanna.hpc.virginia.edu/",
- "filesystem_endpoint" : "sftp://rivanna.hpc.virginia.edu/"
- },
- "interactive" :
- {
- "job_manager_endpoint" : "fork://localhost/",
- "filesystem_endpoint" : "file://localhost/"
+ "default_schema" : "local",
+ "schemas" : {
+ "local" : {
+ "job_manager_endpoint": "slurm://rivanna.hpc.virginia.edu/",
+ "filesystem_endpoint" : "file://rivanna.hpc.virginia.edu/"
+ },
+ "ssh" : {
+ "job_manager_endpoint": "slurm+ssh://rivanna.hpc.virginia.edu/",
+ "filesystem_endpoint" : "sftp://rivanna.hpc.virginia.edu/"
+ },
+ "interactive" : {
+ "job_manager_endpoint": "fork://localhost/",
+ "filesystem_endpoint" : "file://localhost/"
+ }
},
"default_queue" : "standard",
"resource_manager" : "SLURM",
diff --git a/src/radical/pilot/configs/session_default.json b/src/radical/pilot/configs/session_default.json
index e6481bb34e..03910c3ee4 100644
--- a/src/radical/pilot/configs/session_default.json
+++ b/src/radical/pilot/configs/session_default.json
@@ -3,12 +3,10 @@
# specified. It contains the minimal set of settings required for
# a functional rp session, both on the client and on the agent side.
{
- "dburl" : "${RADICAL_PILOT_DBURL}",
+ "proxy_url" : "${RADICAL_PILOT_PROXY_URL}",
+ "proxy_host" : "${RADICAL_PILOT_PROXY_HOST:localhost}",
"session_base" : "${RADICAL_PILOT_SESSION_BASE:$PWD}",
- "record" : "${RADICAL_PILOT_SESSION_RECORD}",
-
- "bulk_time" : 1.0,
- "bulk_size" : 1024,
+ "base" : "${RADICAL_PILOT_BASE:$PWD}",
"heartbeat" : {
"interval" : 1.0,
@@ -16,33 +14,17 @@
},
"bridges" : {
- "log_pubsub" : {"kind" : "pubsub",
- "log_level" : "error",
- "stall_hwm" : 1,
- "bulk_size" : 0},
- "state_pubsub" : {"kind" : "pubsub",
- "log_level" : "error",
- "stall_hwm" : 1,
- "bulk_size" : 0},
- "control_pubsub" : {"kind" : "pubsub",
- "log_level" : "error",
- "stall_hwm" : 1,
- "bulk_size" : 0},
+ "log_pubsub" : {"kind": "pubsub"},
+ "state_pubsub" : {"kind": "pubsub"},
+ "control_pubsub" : {"kind": "pubsub"},
- "stager_request_queue" : {"kind" : "queue",
- "log_level" : "error",
- "stall_hwm" : 1,
- "bulk_size" : 0},
- "stager_response_pubsub": {"kind" : "pubsub",
- "log_level" : "error",
- "stall_hwm" : 1,
- "bulk_size" : 0}
+ "stager_request_queue" : {"kind": "queue" },
+ "stager_response_pubsub": {"kind": "pubsub"}
},
"components" : {
# how many instances of the respective components should be started
- "update" : { "count" : 1 },
- "stager" : { "count" : 1 }
+ "stager" : { "count": 1 }
}
}
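
The session config relies on `${VAR:default}` interpolation, e.g. `${RADICAL_PILOT_PROXY_HOST:localhost}`. The expansion itself is handled by radical.utils; the sketch below only illustrates the syntax semantics assumed here (value read from the environment, text after `:` used as fallback), leaving nested defaults such as `$PWD` to the real implementation:

```python
import os
import re

# hedged sketch of '${VAR:default}' expansion; the real expansion lives in
# radical.utils and additionally resolves nested variables like '$PWD'
_PAT = re.compile(r'\$\{([^}:]+)(?::([^}]*))?\}')

def expand(value):
    return _PAT.sub(lambda m: os.environ.get(m.group(1), m.group(2) or ''),
                    value)

# assuming RADICAL_PILOT_PROXY_HOST is not set in the environment:
assert expand('${RADICAL_PILOT_PROXY_HOST:localhost}') == 'localhost'
```
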
diff --git a/src/radical/pilot/configs/tmgr_default.json b/src/radical/pilot/configs/tmgr_default.json
index 51a4417a54..6f096a41c6 100644
--- a/src/radical/pilot/configs/tmgr_default.json
+++ b/src/radical/pilot/configs/tmgr_default.json
@@ -3,43 +3,28 @@
# specified. It contains the minimal set of settings required for
# a functional task manager.
{
- # default scheduler
"scheduler" : "round_robin",
+ "bulk_size" : 4096,
+ "bulk_time" : 10.0,
- # max number of updates to put into a db bulk
- "bulk_collection_size" : 100,
+ "db_poll_sleeptime" : 10.0,
- # max time period to collect db notifications into bulks (seconds)
- "bulk_collection_time" : 1.0,
-
- # time to sleep between database polls (seconds)
- "db_poll_sleeptime" : 1.0,
+ "heartbeat" : {
+ "interval" : 3.0,
+ "timeout" : 10.0
+ },
# The threshold at which we decide to trigger bulk mkdir
"task_bulk_mkdir_threshold" : 16,
"bridges" : {
- "tmgr_staging_input_queue" : {"kind" : "queue",
- "log_level" : "error",
- "stall_hwm" : 0,
- "bulk_size" : 0},
- "tmgr_scheduling_queue" : {"kind" : "queue",
- "log_level" : "error",
- "stall_hwm" : 0,
- "bulk_size" : 0},
- "tmgr_staging_output_queue" : {"kind" : "queue",
- "log_level" : "error",
- "stall_hwm" : 0,
- "bulk_size" : 0},
-
- "tmgr_unschedule_pubsub" : {"kind" : "pubsub",
- "log_level" : "error",
- "stall_hwm" : 0,
- "bulk_size" : 0},
- "tmgr_reschedule_pubsub" : {"kind" : "pubsub",
- "log_level" : "error",
- "stall_hwm" : 0,
- "bulk_size" : 0}
+ "tmgr_staging_input_queue" : {"kind": "queue" },
+ "tmgr_scheduling_queue" : {"kind": "queue" },
+ "agent_staging_input_queue" : {"kind": "queue" },
+ "tmgr_staging_output_queue" : {"kind": "queue" }
+
+ # "tmgr_unschedule_pubsub" : {"kind": "pubsub"},
+ # "tmgr_reschedule_pubsub" : {"kind": "pubsub"}
},
"components" : {
diff --git a/src/radical/pilot/constants.py b/src/radical/pilot/constants.py
index 9493aafc55..70195def0b 100644
--- a/src/radical/pilot/constants.py
+++ b/src/radical/pilot/constants.py
@@ -6,8 +6,6 @@
MASTER = 'master'
WORKER = 'worker'
-UPDATE_WORKER = 'update'
-
STAGER_WORKER = 'stager'
STAGER_REQUEST_QUEUE = 'stager_request_queue'
STAGER_RESPONSE_PUBSUB = 'stager_response_pubsub'
@@ -23,10 +21,12 @@
TMGR_STAGING_INPUT_COMPONENT = 'tmgr_staging_input'
TMGR_STAGING_OUTPUT_COMPONENT = 'tmgr_staging_output'
+AGENT_STAGING_INPUT_PUBSUB = 'agent_staging_input_pubsub'
AGENT_STAGING_INPUT_QUEUE = 'agent_staging_input_queue'
AGENT_SCHEDULING_QUEUE = 'agent_scheduling_queue'
AGENT_EXECUTING_QUEUE = 'agent_executing_queue'
AGENT_STAGING_OUTPUT_QUEUE = 'agent_staging_output_queue'
+AGENT_COLLECTING_QUEUE = 'agent_collecting_queue'
RAPTOR_SCHEDULING_QUEUE = 'raptor_scheduling_queue'
@@ -45,6 +45,10 @@
STATE_PUBSUB = 'state_pubsub'
LOG_PUBSUB = 'log_pubsub'
+PROXY_CONTROL_PUBSUB = 'proxy_control_pubsub'
+PROXY_STATE_PUBSUB = 'proxy_state_pubsub'
+PROXY_TASK_QUEUE = 'proxy_task_queue'
+
# ------------------------------------------------------------------------------
#
diff --git a/src/radical/pilot/db/database.py b/src/radical/pilot/db/database.py
index 7f27f0b12b..81f833ed1d 100644
--- a/src/radical/pilot/db/database.py
+++ b/src/radical/pilot/db/database.py
@@ -18,7 +18,7 @@ class DBSession(object):
# --------------------------------------------------------------------------
#
- def __init__(self, sid, dburl, cfg, log, connect=True):
+ def __init__(self, sid, dburl, log, connect=True):
'''
Creates a new session
@@ -32,15 +32,16 @@ def __init__(self, sid, dburl, cfg, log, connect=True):
tasks : document describing a rp.Task
'''
- self._dburl = dburl
- self._log = log
- self._mongo = None
- self._db = None
- self._created = time.time()
- self._connected = None
- self._closed = None
- self._c = None
- self._can_remove = False
+ self._dburl = dburl
+ self._log = log
+ self._mongo = None
+ self._db = None
+ self._created = time.time()
+ self._connected = None
+ self._reconnected = None
+ self._closed = None
+ self._c = None
+ self._can_remove = False
if not connect:
return
@@ -72,10 +73,10 @@ def __init__(self, sid, dburl, cfg, log, connect=True):
self._c.insert({'type' : 'session',
'_id' : sid,
'uid' : sid,
- 'cfg' : cfg.as_dict(),
'created' : self._created,
'connected' : self._connected})
- self._can_remove = True
+ self._can_remove = True
+ self._reconnected = False
else:
docs = self._c.find({'type' : 'session',
@@ -84,9 +85,10 @@ def __init__(self, sid, dburl, cfg, log, connect=True):
raise ValueError('cannot reconnect to session %s' % sid)
doc = docs[0]
- self._can_delete = False
- self._created = doc['created']
- self._connected = time.time()
+ self._can_delete = False
+ self._created = doc['created']
+ self._connected = time.time()
+ self._reconnected = True
# FIXME: get bridge addresses from DB? If not, from where?
@@ -130,6 +132,16 @@ def connected(self):
return self._connected
+ # --------------------------------------------------------------------------
+ #
+ @property
+ def reconnected(self):
+ '''
+ Returns boolean indicating if the session was reconnected (vs. created)
+ '''
+ return self._reconnected
+
+
# --------------------------------------------------------------------------
#
@property
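
The new `reconnected` flag lets callers distinguish a fresh session from a reconnect. A hypothetical use with the reduced constructor signature from this diff (`cfg` dropped); sid, URL, and logger are placeholders:

```python
# hypothetical use of the new flag
dbs = DBSession(sid='rp.session.0000',
                dburl='mongodb://localhost:27017/rp', log=logger)
if not dbs.reconnected:
    # one-time initialization only a freshly created session needs
    pass
```
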
diff --git a/src/radical/pilot/messages.py b/src/radical/pilot/messages.py
new file mode 100644
index 0000000000..2cd141c55b
--- /dev/null
+++ b/src/radical/pilot/messages.py
@@ -0,0 +1,97 @@
+
+
+from typing import Any
+
+import radical.utils as ru
+
+# ------------------------------------------------------------------------------
+#
+class RPBaseMessage(ru.Message):
+
+ # the `fwd` flag distinguishes messages which are forwarded to the proxy
+ # bridge from those which remain local to the module they originate in.
+
+ _schema = {'fwd' : bool}
+ _defaults = {'_msg_type': 'rp_msg',
+ 'fwd' : False}
+
+
+ # we do not register this message type - it is not supposed to be used
+ # directly.
+
+
+# ------------------------------------------------------------------------------
+#
+class HeartbeatMessage(RPBaseMessage):
+
+ # heartbeat messages are never forwarded
+
+ _schema = {'uid' : str}
+ _defaults = {'_msg_type': 'heartbeat',
+ 'fwd' : False,
+ 'uid' : None}
+
+
+ru.Message.register_msg_type('heartbeat', HeartbeatMessage)
+
+
+# ------------------------------------------------------------------------------
+#
+class RPCRequestMessage(RPBaseMessage):
+
+ _schema = {'uid' : str, # uid of message
+ 'addr' : str, # who is expected to act on the request
+ 'cmd' : str, # rpc command
+ 'args' : list, # rpc command arguments
+ 'kwargs' : dict} # rpc command named arguments
+ _defaults = {
+ '_msg_type': 'rpc_req',
+ 'fwd' : True,
+ 'uid' : None,
+ 'addr' : None,
+ 'cmd' : None,
+ 'args' : [],
+ 'kwargs' : {}}
+
+
+
+ru.Message.register_msg_type('rpc_req', RPCRequestMessage)
+
+
+# ------------------------------------------------------------------------------
+#
+class RPCResultMessage(RPBaseMessage):
+
+ _schema = {'uid' : str, # uid of rpc call
+ 'val' : Any, # return value (`None` by default)
+ 'out' : str, # stdout
+ 'err' : str, # stderr
+ 'exc' : str} # raised exception representation
+ _defaults = {'_msg_type': 'rpc_res',
+ 'fwd' : True,
+ 'uid' : None,
+ 'val' : None,
+ 'out' : None,
+ 'err' : None,
+ 'exc' : None}
+
+ # --------------------------------------------------------------------------
+ #
+ def __init__(self, rpc_req=None, from_dict=None, **kwargs):
+
+ # when constructed from a request message copy the uid
+
+ if rpc_req:
+ if not from_dict:
+ from_dict = dict()
+
+ from_dict['uid'] = rpc_req['uid']
+
+ super().__init__(from_dict, **kwargs)
+
+
+ru.Message.register_msg_type('rpc_res', RPCResultMessage)
+
+
+# ------------------------------------------------------------------------------
+
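
A hypothetical round trip with the new message types, assuming the dict-style access `ru.Message` provides; the uids and command are made up:

```python
# hypothetical request/response pairing
req = RPCRequestMessage(uid='pilot.0000.rpc.0000', addr='pilot.0000',
                        cmd='prepare_env',
                        kwargs={'env_name': 'test_env', 'env_spec': {}})

# constructing the result from the request copies the uid, so the caller
# can correlate the response with its pending request
res = RPCResultMessage(rpc_req=req, from_dict={'val': True})
assert res['uid'] == req['uid']
```
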
diff --git a/src/radical/pilot/pilot.py b/src/radical/pilot/pilot.py
index f38543e7f6..13f9561d32 100644
--- a/src/radical/pilot/pilot.py
+++ b/src/radical/pilot/pilot.py
@@ -5,6 +5,9 @@
import copy
import time
+import queue
+
+import threading as mt
import radical.utils as ru
@@ -12,6 +15,7 @@
from . import states as rps
from . import constants as rpc
+from .messages import RPCRequestMessage, RPCResultMessage
from .staging_directives import complete_url
@@ -155,6 +159,18 @@ def __init__(self, pmgr: PilotManager, descr):
self._session_sandbox .path = self._session_sandbox .path % expand
self._pilot_sandbox .path = self._pilot_sandbox .path % expand
+ # hook into the control pubsub for rpc handling
+ self._rpc_reqs = dict()
+ ctrl_addr_sub = self._session._reg['bridges.control_pubsub.addr_sub']
+ ctrl_addr_pub = self._session._reg['bridges.control_pubsub.addr_pub']
+
+ self._ctrl_pub = ru.zmq.Publisher(rpc.CONTROL_PUBSUB, url=ctrl_addr_pub,
+ log=self._log, prof=self._prof)
+
+ ru.zmq.Subscriber(rpc.CONTROL_PUBSUB, url=ctrl_addr_sub,
+ log=self._log, prof=self._prof, cb=self._control_cb,
+ topic=rpc.CONTROL_PUBSUB)
+
# --------------------------------------------------------------------------
#
@@ -226,7 +242,14 @@ def _update(self, pilot_dict):
self._state = target
# keep all information around
- self._pilot_dict = copy.deepcopy(pilot_dict)
+ ru.dict_merge(self._pilot_dict, pilot_dict, ru.OVERWRITE)
+
+ # FIXME MONGODB
+ resources = self._pilot_dict.get('resources') or {}
+ rm_info = resources.get('rm_info')
+ if rm_info:
+ del self._pilot_dict['resources']['rm_info']
+ self._pilot_dict['resource_details'] = rm_info
# invoke pilot specific callbacks
# FIXME: this iteration needs to be thread-locked!
@@ -265,6 +288,7 @@ def as_dict(self):
'stdout' : self.stdout,
'stderr' : self.stderr,
'resource' : self.resource,
+ 'resources' : self.resources,
'endpoint_fs' : str(self._endpoint_fs),
'resource_sandbox' : str(self._resource_sandbox),
'session_sandbox' : str(self._session_sandbox),
@@ -379,6 +403,15 @@ def resource(self):
return self._descr.get('resource')
+ # --------------------------------------------------------------------------
+ #
+ @property
+ def resources(self):
+ """str: The amount of resources used by this pilot."""
+
+ return self._pilot_dict.get('resources')
+
+
# --------------------------------------------------------------------------
#
@property
@@ -564,6 +597,15 @@ def wait(self, state=None, timeout=None):
def cancel(self):
"""Cancel the pilot."""
+ self._finalize()
+
+ self._pmgr.cancel_pilots(self._uid)
+
+
+ # --------------------------------------------------------------------------
+ #
+ def _finalize(self):
+
# clean connection cache
try:
for key in self._cache:
@@ -573,8 +615,6 @@ def cancel(self):
except:
pass
- self._pmgr.cancel_pilots(self.uid)
-
# --------------------------------------------------------------------------
#
@@ -664,22 +704,7 @@ def prepare_env(self, env_name, env_spec):
"""
- self.rpc('prepare_env', {'env_name': env_name,
- 'env_spec': env_spec})
-
-
- # --------------------------------------------------------------------------
- #
- def rpc(self, rpc, args):
- """Remote procedure call.
-
- Send a pilot command, wait for the response, and return the result.
- This is basically an RPC into the pilot.
- """
-
- reply = self._session._dbs.pilot_rpc(self.uid, self.uid, rpc, args)
-
- return reply
+ self.rpc('prepare_env', env_name=env_name, env_spec=env_spec)
# --------------------------------------------------------------------------
@@ -708,6 +733,68 @@ def stage_in(self, sds):
return [sd['target'] for sd in sds]
+ # --------------------------------------------------------------------------
+ #
+ def _control_cb(self, topic, msg_data):
+
+ # we only listen for RPCResponse messages
+
+ try:
+ msg = ru.zmq.Message.deserialize(msg_data)
+
+ if isinstance(msg, RPCResultMessage):
+
+ self._log.debug_4('handle rpc result %s', msg)
+
+ if msg.uid in self._rpc_reqs:
+ self._rpc_reqs[msg.uid]['res'] = msg
+ self._rpc_reqs[msg.uid]['evt'].set()
+
+ except:
+ pass
+
+
+ # --------------------------------------------------------------------------
+ #
+ def rpc(self, cmd, *args, **kwargs):
+ '''Remote procedure call.
+
+ Send an RPC command and arguments to the pilot and wait for the
+ response. This is a synchronous operation at this point, and it is not
+ thread safe to have multiple concurrent RPC calls.
+ '''
+
+ # RPCs can only be handled in `PMGR_ACTIVE` state
+ # FIXME: RPCs will hang forever if the pilot dies after sending the msg
+ self.wait(rps.PMGR_ACTIVE)
+
+ rpc_id = ru.generate_id('%s.rpc' % self._uid)
+ rpc_req = RPCRequestMessage(uid=rpc_id, cmd=cmd, args=args,
+ kwargs=kwargs, addr=self.uid)
+
+ self._rpc_reqs[rpc_id] = {
+ 'req': rpc_req,
+ 'res': None,
+ 'evt': mt.Event(),
+ 'time': time.time(),
+ }
+
+ self._ctrl_pub.put(rpc.CONTROL_PUBSUB, rpc_req)
+
+ while True:
+
+ if not self._rpc_reqs[rpc_id]['evt'].wait(timeout=60):
+ self._log.debug('still waiting for rpc request %s', rpc_id)
+ continue
+
+ rpc_res = self._rpc_reqs[rpc_id]['res']
+
+ if rpc_res.exc:
+ raise RuntimeError('rpc failed: %s' % rpc_res.exc)
+
+ return rpc_res.val
+
+
# --------------------------------------------------------------------------
#
def stage_out(self, sds=None):
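
With the RPC path moved from the database to the control pubsub, client code keeps the same call shape; `prepare_env` above is now a thin wrapper over `Pilot.rpc`. A hedged usage sketch (resource name, env name, and spec are illustrative):

```python
import radical.pilot as rp

session = rp.Session()
pmgr    = rp.PilotManager(session=session)
pilot   = pmgr.submit_pilots(rp.PilotDescription(
              {'resource': 'local.localhost', 'cores': 4, 'runtime': 30}))

# blocks until the pilot reaches PMGR_ACTIVE, round-trips the request over
# the control pubsub, and returns the call's result value
pilot.rpc('prepare_env', env_name='numpy_env',
          env_spec={'type': 'venv', 'setup': ['numpy']})
```
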
diff --git a/src/radical/pilot/pilot_manager.py b/src/radical/pilot/pilot_manager.py
index 2ea51e63d5..63c1428616 100644
--- a/src/radical/pilot/pilot_manager.py
+++ b/src/radical/pilot/pilot_manager.py
@@ -38,7 +38,7 @@ class PilotManager(rpu.Component):
Example::
- s = rp.Session(database_url=DBURL)
+ s = rp.Session()
pm = rp.PilotManager(session=s)
@@ -75,7 +75,7 @@ class PilotManager(rpu.Component):
# --------------------------------------------------------------------------
#
- def __init__(self, session, uid=None, cfg='default'):
+ def __init__(self, session, cfg='default'):
"""Creates a new PilotManager and attaches is to the session.
Arguments:
@@ -88,16 +88,11 @@ def __init__(self, session, uid=None, cfg='default'):
"""
- assert session.primary, 'pmgr needs primary session'
+ assert session._role == session._PRIMARY, 'pmgr needs primary session'
# initialize the base class (with no intent to fork)
- if uid:
- self._reconnect = True
- self._uid = uid
- else:
- self._reconnect = False
- self._uid = ru.generate_id('pmgr.%(item_counter)04d',
- ru.ID_CUSTOM, ns=session.uid)
+ self._uid = ru.generate_id('pmgr.%(item_counter)04d',
+ ru.ID_CUSTOM, ns=session.uid)
self._uids = list() # known UIDs
self._pilots = dict()
@@ -124,7 +119,7 @@ def __init__(self, session, uid=None, cfg='default'):
cfg.owner = self._uid
cfg.sid = session.uid
cfg.path = session.path
- cfg.dburl = session.dburl
+ cfg.reg_addr = session.reg_addr
cfg.heartbeat = session.cfg.heartbeat
cfg.client_sandbox = session._get_client_sandbox()
@@ -132,18 +127,16 @@ def __init__(self, session, uid=None, cfg='default'):
self.start()
self._log.info('started pmgr %s', self._uid)
+
+ self._rep = self._session._get_reporter(name=self._uid)
self._rep.info('<>ok\n')
@@ -244,6 +239,17 @@ def close(self, terminate=True):
self._rep.ok('>>ok\n')
+ # dump json
+ json = self.as_dict()
+ # json['_id'] = self.uid
+ json['type'] = 'pmgr'
+ json['uid'] = self.uid
+ json['pilots'] = [pilot.as_dict() for pilot in self._pilots.values()]
+
+ tgt = '%s/%s.json' % (self._session.path, self.uid)
+ ru.write_json(json, tgt)
+
+
# --------------------------------------------------------------------------
#
def as_dict(self):
@@ -277,33 +283,6 @@ def _pilot_heartbeat_cb(self):
return True
- # --------------------------------------------------------------------------
- #
- def _state_pull_cb(self):
-
- if self._terminate.is_set():
- return False
-
- # pull all pilot states from the DB, and compare to the states we know
- # about. If any state changed, update the known pilot instances and
- # push an update message to the state pubsub.
- # pubsub.
- # FIXME: we also pull for dead pilots. That is not efficient...
- # FIXME: this needs to be converted into a tailed cursor in the update
- # worker
- # FIXME: this is a big and frequently invoked lock
- pilot_dicts = self._session._dbs.get_pilots(pmgr_uid=self.uid)
-
-
- for pilot_dict in pilot_dicts:
- self._log.debug('state pulled: %s: %s', pilot_dict['uid'],
- pilot_dict['state'])
- if not self._update_pilot(pilot_dict, publish=True):
- return False
-
- return True
-
-
# --------------------------------------------------------------------------
#
def _state_sub_cb(self, topic, msg):
@@ -312,7 +291,7 @@ def _state_sub_cb(self, topic, msg):
return False
- self._log.debug('state event: %s', msg)
+ # self._log.debug('state event: %s', msg)
cmd = msg.get('cmd')
arg = msg.get('arg')
@@ -328,17 +307,37 @@ def _state_sub_cb(self, topic, msg):
if 'type' in thing and thing['type'] == 'pilot':
- self._log.debug('state push: %s: %s', thing['uid'],
- thing['state'])
+ self._log.debug('state push: %s: %s %s', thing['uid'],
+ thing['state'], thing.get('resources'))
# we got the state update from the state callback - don't
# publish it again
- if not self._update_pilot(thing, publish=False):
- return False
+ self._update_pilot(thing, publish=False)
return True
+ # --------------------------------------------------------------------------
+ #
+ def control_cb(self, topic, msg):
+
+ if self._terminate.is_set():
+ return False
+
+ cmd = msg['cmd']
+ arg = msg['arg']
+
+ self._log.debug_9('got control cmd %s: %s', cmd, arg)
+
+ if cmd == 'pilot_activate':
+ pilot = arg['pilot']
+ self._update_pilot(pilot, publish=True)
+
+ # store resource json for RA
+ fname = '%s/%s.resources.json' % (self._cfg.path, pilot['uid'])
+ ru.write_json(fname, pilot['resources'])
+
+
# --------------------------------------------------------------------------
#
def _update_pilot(self, pilot_dict, publish=False, advance=True):
@@ -352,25 +351,25 @@ def _update_pilot(self, pilot_dict, publish=False, advance=True):
# we don't care about pilots we don't know
if pid not in self._pilots:
- return True # this is not an error
+ return # this is not an error
# only update on state changes
current = self._pilots[pid].state
target = pilot_dict['state']
if current == target:
- return True
+ return
target, passed = rps._pilot_state_progress(pid, current, target)
- # print '%s current: %s' % (pid, current)
- # print '%s target : %s' % (pid, target )
- # print '%s passed : %s' % (pid, passed )
+ # self._log.debug('%s current: %s', pid, current)
+ # self._log.debug('%s target : %s', pid, target )
+ # self._log.debug('%s passed : %s', pid, passed )
if target in [rps.CANCELED, rps.FAILED]:
# don't replay intermediate states
passed = passed[-1:]
for s in passed:
- # print '%s advance: %s' % (pid, s )
+ self._log.debug('%s advance: %s', pid, s )
# we got state from either pubsub or DB, so don't publish again.
# we also don't need to maintain bulks for that reason.
pilot_dict['state'] = s
@@ -384,8 +383,6 @@ def _update_pilot(self, pilot_dict, publish=False, advance=True):
pilot_dict.get('lm_info'),
pilot_dict.get('lm_detail'))
- return True
-
# --------------------------------------------------------------------------
#
@@ -416,7 +413,8 @@ def _call_pilot_callbacks(self, pilot):
#
def _pilot_send_hb(self, pid=None):
- self._session._dbs.pilot_command('heartbeat', {'pmgr': self._uid}, pid)
+ self.publish(rpc.CONTROL_PUBSUB, {'cmd' : 'pmgr_heartbeat',
+ 'arg' : {'pmgr' : self.uid}})
# --------------------------------------------------------------------------
@@ -611,9 +609,6 @@ def submit_pilots(self, descriptions):
# only trigger the profile entry for NEW.
self.advance(pilot_docs, state=rps.NEW, publish=False, push=False)
- # insert pilots into the database, as a bulk.
- self._session._dbs.insert_pilots(pilot_docs)
-
# immediately send first heartbeat
for pilot_doc in pilot_docs:
pid = pilot_doc['uid']
@@ -647,15 +642,16 @@ def _reconnect_pilots(self):
# self.is_valid()
- pilot_docs = self._session._dbs.get_pilots(pmgr_uid=self.uid)
+ # FIXME MONGODB
+ # pilot_docs = self._session._dbs.get_pilots(pmgr_uid=self.uid)
- with self._pilots_lock:
- for ud in pilot_docs:
+ # with self._pilots_lock:
+ # for ud in pilot_docs:
- descr = PilotDescription(ud['description'])
- pilot = Pilot(pmgr=self, descr=descr)
+ # descr = PilotDescription(ud['description'])
+ # pilot = Pilot(pmgr=self, descr=descr)
- self._pilots[pilot.uid] = pilot
+ # self._pilots[pilot.uid] = pilot
# --------------------------------------------------------------------------
@@ -804,12 +800,14 @@ def _fail_missing_pilots(self):
cancellation command in due time, if they can
"""
- with self._pilots_lock:
- for pid in self._pilots:
- pilot = self._pilots[pid]
- if pilot.state not in rps.FINAL:
- self.advance(pilot.as_dict(), rps.FAILED,
- publish=True, push=False)
+ pass
+
+ # with self._pilots_lock:
+ # for pid in self._pilots:
+ # pilot = self._pilots[pid]
+ # if pilot.state not in rps.FINAL:
+ # self.advance(pilot.as_dict(), rps.FAILED,
+ # publish=True, push=False)
# --------------------------------------------------------------------------
@@ -829,21 +827,25 @@ def cancel_pilots(self, uids=None, _timeout=None):
if not isinstance(uids, list):
uids = [uids]
- with self._pilots_lock:
- for uid in uids:
- if uid not in self._pilots:
- raise ValueError('pilot %s not known' % uid)
-
self._log.debug('pilot(s).need(s) cancellation %s', uids)
# send the cancellation request to the pilots
- # FIXME: the cancellation request should not go directly to the DB, but
- # through the DB abstraction layer...
- self._session._dbs.pilot_command('cancel_pilot', [], uids)
-
+ # FIXME: MongoDB
+ # self._session._dbs.pilot_command('cancel_pilot', [], uids)
+ self._log.debug('issue cancel_pilots for %s', uids)
+ self.publish(rpc.CONTROL_PUBSUB, {'cmd' : 'cancel_pilots',
+ 'arg' : {'pmgr' : self.uid,
+ 'uids' : uids}})
# wait for the cancel to be enacted
self.wait_pilots(uids=uids, timeout=_timeout)
+ # FIXME: only finalize pilots which actually terminated
+ with self._pilots_lock:
+ for uid in uids:
+ if uid not in self._pilots:
+ raise ValueError('pilot %s not known' % uid)
+ self._pilots[uid]._finalize()
+
# --------------------------------------------------------------------------
#
diff --git a/src/radical/pilot/pmgr/launching/base.py b/src/radical/pilot/pmgr/launching/base.py
index 226550cfed..96ad355c38 100644
--- a/src/radical/pilot/pmgr/launching/base.py
+++ b/src/radical/pilot/pmgr/launching/base.py
@@ -134,9 +134,6 @@ def __init__(self, cfg, session):
self._stager_queue = self.get_output_ep(rpc.STAGER_REQUEST_QUEUE)
- # we listen for pilot cancel commands
- self.register_subscriber(rpc.CONTROL_PUBSUB, self._pmgr_control_cb)
-
# also listen for completed staging directives
self.register_subscriber(rpc.STAGER_RESPONSE_PUBSUB, self._staging_ack_cb)
self._active_sds = dict()
@@ -220,12 +217,12 @@ def finalize(self):
# --------------------------------------------------------------------------
#
- def _pmgr_control_cb(self, topic, msg):
+ def control_cb(self, topic, msg):
cmd = msg['cmd']
arg = msg['arg']
- self._log.debug('launcher got %s', msg)
+ self._log.debug_9('launcher got %s', msg)
if cmd == 'kill_pilots':
@@ -242,8 +239,6 @@ def _pmgr_control_cb(self, topic, msg):
self._kill_pilots(pids)
- return True
-
# --------------------------------------------------------------------------
#
@@ -308,13 +303,6 @@ def work(self, pilots):
self._start_pilot_bulk(resource, schema, pilots)
- # Update the Pilots' state to 'PMGR_ACTIVE_PENDING' if job
- # submission was successful. Since the pilot leaves the
- # scope of the PMGR for the time being, we update the
- # complete DB document
- for pilot in pilots:
- pilot['$all'] = True
-
self.advance(pilots, rps.PMGR_ACTIVE_PENDING,
push=False, publish=True)
@@ -431,8 +419,13 @@ def _start_pilot_bulk(self, resource, schema, pilots):
for fname in ru.as_list(pilot['description'].get('input_staging')):
base = os.path.basename(fname)
# checking if input staging file exists
+ if fname.startswith('./'):
+ fname = fname.split('./', maxsplit=1)[1]
+ if not fname.startswith('/'):
+ fname = os.path.join(self._cfg.base, fname)
if not os.path.exists(fname):
- raise RuntimeError('input_staging file does not exists: %s for pilot %s' % fname, pid)
+ raise RuntimeError('input_staging file does not exist: '
+ '%s for pilot %s' % (fname, pid))
ft_list.append({'src': fname,
'tgt': '%s/%s' % (pid, base),
@@ -473,6 +466,7 @@ def _start_pilot_bulk(self, resource, schema, pilots):
cmd = 'ln -s %s %s/%s' % (os.path.abspath(src), tmp_dir, tgt)
out, err, ret = ru.sh_callout(cmd, shell=True)
if ret:
+ self._log.debug('cmd: %s', cmd)
self._log.debug('out: %s', out)
self._log.debug('err: %s', err)
raise RuntimeError('callout failed: %s' % cmd)
@@ -483,6 +477,7 @@ def _start_pilot_bulk(self, resource, schema, pilots):
out, err, ret = ru.sh_callout(cmd, shell=True)
if ret:
+ self._log.debug('cmd: %s', cmd)
self._log.debug('out: %s', out)
self._log.debug('err: %s', err)
raise RuntimeError('callout failed: %s' % cmd)
@@ -547,8 +542,8 @@ def _prepare_pilot(self, resource, rcfg, pilot, expand, tar_name):
# ----------------------------------------------------------------------
# Database connection parameters
- sid = self._session.uid
- database_url = self._session.cfg.dburl
+ sid = self._session.uid
+ proxy_url = self._session.cfg.proxy_url
# some default values are determined at runtime
default_virtenv = '%%(resource_sandbox)s/ve.%s.%s' % \
@@ -571,7 +566,7 @@ def _prepare_pilot(self, resource, rcfg, pilot, expand, tar_name):
# ----------------------------------------------------------------------
# get parameters from resource cfg, set defaults where needed
- agent_dburl = rcfg.get('agent_mongodb_endpoint', database_url)
+ agent_proxy_url = rcfg.get('agent_proxy_url', proxy_url)
agent_spawner = rcfg.get('agent_spawner', DEFAULT_AGENT_SPAWNER)
agent_config = rcfg.get('agent_config', DEFAULT_AGENT_CONFIG)
agent_scheduler = rcfg.get('agent_scheduler')
@@ -662,11 +657,11 @@ def _prepare_pilot(self, resource, rcfg, pilot, expand, tar_name):
raise RuntimeError("'global_virtenv' is deprecated (%s)" % resource)
# Create a host:port string for use by the bootstrap_0.
- db_url = ru.Url(agent_dburl)
- if db_url.port:
- db_hostport = "%s:%d" % (db_url.host, db_url.port)
+ tmp = ru.Url(agent_proxy_url)
+ if tmp.port:
+ hostport = "%s:%d" % (tmp.host, tmp.port)
else:
- db_hostport = "%s:%d" % (db_url.host, 27017) # mongodb default
+ raise RuntimeError('service URL needs port number: %s' % tmp)
# ----------------------------------------------------------------------
# the version of the agent is derived from
@@ -853,7 +848,7 @@ def _prepare_pilot(self, resource, rcfg, pilot, expand, tar_name):
# set optional args
if resource_manager == "CCM": bs_args.extend(['-c'])
if forward_tunnel_endpoint: bs_args.extend(['-f', forward_tunnel_endpoint])
- if forward_tunnel_endpoint: bs_args.extend(['-h', db_hostport])
+ if forward_tunnel_endpoint: bs_args.extend(['-h', hostport])
if python_interpreter: bs_args.extend(['-i', python_interpreter])
if tunnel_bind_device: bs_args.extend(['-t', tunnel_bind_device])
if cleanup: bs_args.extend(['-x', cleanup])
@@ -861,7 +856,11 @@ def _prepare_pilot(self, resource, rcfg, pilot, expand, tar_name):
for arg in pre_bootstrap_0: bs_args.extend(['-e', arg])
for arg in pre_bootstrap_1: bs_args.extend(['-w', arg])
- agent_cfg['owner'] = 'agent.0'
+ agent_cfg['uid'] = 'agent_0'
+ agent_cfg['sid'] = sid
+ agent_cfg['pid'] = pid
+ agent_cfg['owner'] = pid
+ agent_cfg['pmgr'] = self._pmgr
agent_cfg['resource'] = resource
agent_cfg['nodes'] = requested_nodes
agent_cfg['cores'] = allocated_cores
@@ -870,11 +869,7 @@ def _prepare_pilot(self, resource, rcfg, pilot, expand, tar_name):
agent_cfg['scheduler'] = agent_scheduler
agent_cfg['runtime'] = runtime
agent_cfg['app_comm'] = app_comm
- agent_cfg['dburl'] = str(database_url)
- agent_cfg['sid'] = sid
- agent_cfg['pid'] = pid
- agent_cfg['pmgr'] = self._pmgr
- agent_cfg['logdir'] = '.'
+ agent_cfg['proxy_url'] = agent_proxy_url
agent_cfg['pilot_sandbox'] = pilot_sandbox
agent_cfg['session_sandbox'] = session_sandbox
agent_cfg['resource_sandbox'] = resource_sandbox
@@ -889,20 +884,19 @@ def _prepare_pilot(self, resource, rcfg, pilot, expand, tar_name):
agent_cfg['task_post_launch'] = task_post_launch
agent_cfg['task_post_exec'] = task_post_exec
agent_cfg['resource_cfg'] = copy.deepcopy(rcfg)
- agent_cfg['debug'] = self._log.getEffectiveLevel()
+ agent_cfg['log_lvl'] = self._log.level
+ agent_cfg['debug_lvl'] = self._log.debug_level
agent_cfg['services'] = services
- # we'll also push the agent config into MongoDB
pilot['cfg'] = agent_cfg
pilot['resources'] = {'cpu': allocated_cores,
'gpu': allocated_gpus}
- pilot['$set'] = ['resources']
# ----------------------------------------------------------------------
# Write agent config dict to a json file in pilot sandbox.
- agent_cfg_name = 'agent.0.cfg'
+ agent_cfg_name = 'agent_0.cfg'
cfg_tmp_handle, cfg_tmp_file = tempfile.mkstemp(prefix='rp.agent_cfg.')
os.close(cfg_tmp_handle) # file exists now
diff --git a/src/radical/pilot/pmgr/launching/saga.py b/src/radical/pilot/pmgr/launching/saga.py
index 039dc86c8f..9faf852639 100644
--- a/src/radical/pilot/pmgr/launching/saga.py
+++ b/src/radical/pilot/pmgr/launching/saga.py
@@ -67,6 +67,8 @@ def _translate_state(self, saga_state):
#
def _job_state_cb(self, job, _, saga_state, pid):
+ self._log.debug('job state: %s %s %s', pid, saga_state, job.id)
+
try:
with self._lock:
@@ -206,6 +208,11 @@ def kill_pilots(self, pids):
self._log.debug('cancellation start')
tc.cancel()
tc.wait()
+
+ for pid in pids:
+ pilot = self._pilots[pid]
+ self._state_cb(pilot, rps.CANCELED)
+
self._log.debug('cancellation done')
diff --git a/src/radical/pilot/proxy.py b/src/radical/pilot/proxy.py
new file mode 100644
index 0000000000..88c028934d
--- /dev/null
+++ b/src/radical/pilot/proxy.py
@@ -0,0 +1,345 @@
+
+import sys
+import time
+import queue
+
+import threading as mt
+import multiprocessing as mp
+import radical.utils as ru
+
+
+_TIMEOUT = 300 # seconds without client heartbeat before the bridge is closed
+_LINGER_TIMEOUT = 250 # ms to linger after close
+_HIGH_WATER_MARK = 0 # number of messages to buffer before dropping
+ # 0: infinite
+
+
+# ------------------------------------------------------------------------------
+# This ZMQ bridge links clients and agents, and bridges network gaps. As such
+# it needs to run on a resource which has a public IP address that can be
+# reached from both the client and the target machine.
+#
+# The bridge listens on a `REP` socket (`bridge_request`) for incoming client or
+# agent connections, identified by a common session ID. A client connection
+# will trigger the creation of the following communication channels:
+#
+# - proxy_control_pubsub_bridge
+# links client and agent control pubsubs (includes heartbeat)
+# - proxy_state_pubsub_bridge
+# forwards task state updates from agents to client
+# - proxy_task_queue
+# forwards tasks from the client to the agents and vice versa
+#
+#
+# The protocol on the `bridge_request` channel is as follows:
+#
+# register
+# --------
+#
+# request:
+# 'cmd': 'register'
+# 'arg': {'sid': <sid>}
+#
+# reply:
+# 'res': {'proxy_control_pubsub': {'sub': <url>, 'pub': <url>},
+# 'proxy_state_pubsub' : {'sub': <url>, 'pub': <url>},
+# 'proxy_task_queue' : {'put': <url>, 'get': <url>}}
+#
+# notes:
+# - the request will fail if the session ID is known from another
+# `register` call
+# 'err': 'sid already connected'
+# - this request should otherwise always succeed
+# - the created pubsub channels will be terminated if the control channel
+# has not seen a client heartbeat for <10 * heartbeat_interval> seconds
+# - the same termination semantics hold for the 'unregister' request -
+# see that request for details.
+# - any task queues which exist for that session at the time of
+# termination will also be closed, disregarding any data held in those
+# queues.
+#
+#
+# lookup
+# ------
+#
+# request:
+# 'cmd': 'lookup'
+# 'arg': {'sid': <sid>}
+#
+# reply:
+# 'res': {'proxy_control_pubsub': {'sub': <url>, 'pub': <url>},
+# 'proxy_state_pubsub' : {'sub': <url>, 'pub': <url>},
+# 'proxy_task_queue' : {'put': <url>, 'get': <url>}}
+#
+# notes:
+# - the request will fail if the session ID is not registered (anymore)
+# - this request should otherwise always succeed
+# - the call returns the same information as `register`, but does
+# not alter the state of the client's bridge in any other way.
+# - the request does not count as a heartbeat
+#
+#
+# unregister
+# ----------
+#
+# request:
+# 'cmd': 'unregister'
+# 'arg': {'sid': <sid>}
+#
+# reply:
+# 'res': 'ok'
+#
+# - this method only fails when the session is not connected, with
+# 'err': 'session not connected'
+# - in all other cases, the request will cause the immediate termination of
+# all ZMQ bridges (pubsubs and queues) previously created for that
+# session, regardless of their state, discarding all undelivered
+# messages still held in the bridges.
+#
+#
+# heartbeat
+# ---------
+#
+# request:
+# 'cmd': 'heartbeat'
+# 'arg': {'sid': <sid>}
+#
+# reply:
+# 'res': {'time': <timestamp>}
+#
+# notes:
+# - this request will fail if the session is either not connected or timed
+# out because of an earlier heartbeat failure:
+# 'err': 'session not connected'
+# - it will otherwise assure the server that the client is still alive and
+# requires the bridge to be up. If the server does not receive a heartbeat
+# for longer than `_TIMEOUT` seconds, the bridge will be terminated.
+#
+#
+# default error mode
+# ------------------
+#
+# To any request other than the above, the ZMQ bridge will respond:
+# 'err': 'invalid request'
+#
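+# As an illustration, a client-side exchange could look like the sketch
+# below. It assumes the generic `ru.zmq.Client` request helper from
+# radical.utils; exact signatures may differ:
+#
+#     client = ru.zmq.Client(url='tcp://<proxy_host>:10000')
+#     cfg    = client.request('register', {'sid': sid})
+#     ...                                        # use the returned endpoints
+#     client.request('heartbeat', {'sid': sid})  # repeat while session lives
+#     client.request('unregister', {'sid': sid})
+#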
+# ------------------------------------------------------------------------------
+
+# ------------------------------------------------------------------------------
+#
+class Proxy(ru.zmq.Server):
+
+ def __init__(self, path=None):
+
+ self._lock = mt.Lock()
+ self._clients = dict()
+
+ ru.zmq.Server.__init__(self, url='tcp://*:10000+', path=path)
+
+ self._monitor_thread = mt.Thread(target=self._monitor)
+ self._monitor_thread.daemon = True
+ self._monitor_thread.start()
+
+ self.register_request('register', self._register)
+ self.register_request('lookup', self._lookup)
+ self.register_request('unregister', self._unregister)
+ self.register_request('heartbeat', self._heartbeat)
+ self.register_request('service_stop', self._service_stop)
+
+
+ # --------------------------------------------------------------------------
+ #
+ def _monitor(self):
+
+ # this is a daemon thread - it never exits until process termination
+ while True:
+
+ time.sleep(10)
+ now = time.time()
+
+ # iterate w/o lock, and thus get a snapshot of the known sids
+ sids = list(self._clients.keys())
+
+ to_terminate = list()
+ for sid in sids:
+
+ client = self._clients.get(sid)
+ if not client:
+ continue
+
+ if now > (client['hb'] + _TIMEOUT):
+ self._log.warn('client %s timed out' % sid)
+ to_terminate.append(sid)
+
+ if not to_terminate:
+ continue
+
+ with self._lock:
+
+ for sid in to_terminate:
+
+ client = self._clients.get(sid)
+ if not client:
+ continue
+
+ client['term'].set()
+ client['proc'].join()
+ del self._clients[sid]
+
+
+ # --------------------------------------------------------------------------
+ #
+ def stop(self):
+
+ for sid in self._clients:
+ self._log.info('stop client %s' % sid)
+ self._clients[sid]['term'].set()
+
+ self._log.info('stop proxy service')
+ ru.zmq.Server.stop(self)
+
+
+ # --------------------------------------------------------------------------
+ #
+ def _register(self, arg):
+
+ sid = arg['sid']
+
+ if sid in self._clients:
+ raise RuntimeError('client already registered')
+
+ q = mp.Queue()
+ term = mp.Event()
+ proc = mp.Process(target=self._worker, args=(sid, q, term))
+ proc.start()
+
+ try:
+ cfg = q.get(timeout=10)
+ except queue.Empty as e:
+ proc.terminate()
+ raise RuntimeError('worker startup failed') from e
+
+ self._clients[sid] = {'proc': proc,
+ 'term': term,
+ 'cfg' : cfg,
+ 'hb' : time.time()}
+
+ return self._clients[sid]['cfg']
+
+
+ # --------------------------------------------------------------------------
+ #
+ def _worker(self, sid, q, term):
+
+ # FIXME: log level etc
+ log = ru.Logger('radical.pilot.bridge', level='debug', path=sid)
+
+ proxy_cp = None
+ proxy_sp = None
+ proxy_tq = None
+
+ try:
+ proxy_cp = ru.zmq.PubSub(channel='proxy_control_pubsub',
+ cfg={'uid' : 'proxy_control_pubsub',
+ 'type' : 'pubsub',
+ 'log_lvl': 'debug',
+ 'path' : sid})
+
+ proxy_sp = ru.zmq.PubSub(channel='proxy_state_pubsub',
+ cfg={'uid' : 'proxy_state_pubsub',
+ 'type' : 'pubsub',
+ 'log_lvl': 'debug',
+ 'path' : sid})
+
+ proxy_tq = ru.zmq.Queue (channel='proxy_task_queue',
+ cfg={'uid' : 'proxy_task_queue',
+ 'type' : 'queue',
+ 'log_lvl': 'debug',
+ 'path' : sid})
+
+ proxy_cp.start()
+ proxy_sp.start()
+ proxy_tq.start()
+
+ cfg = {'proxy_control_pubsub': {'addr_pub': str(proxy_cp.addr_pub),
+ 'addr_sub': str(proxy_cp.addr_sub)},
+ 'proxy_state_pubsub' : {'addr_pub': str(proxy_sp.addr_pub),
+ 'addr_sub': str(proxy_sp.addr_sub)},
+ 'proxy_task_queue' : {'addr_put': str(proxy_tq.addr_put),
+ 'addr_get': str(proxy_tq.addr_get)}}
+
+ # inform service about endpoint details
+ q.put(cfg)
+
+ # we run forever until we receive a termination command
+ log.info('work')
+ term.wait()
+
+
+ except:
+ log.exception('worker failed')
+
+ finally:
+
+ if proxy_cp: proxy_cp.stop()
+ if proxy_sp: proxy_sp.stop()
+ if proxy_tq: proxy_tq.stop()
+
+ log.info('terminated')
+
+
+ # --------------------------------------------------------------------------
+ #
+ def _lookup(self, arg):
+
+ sid = arg['sid']
+
+ with self._lock:
+ if sid not in self._clients:
+ raise RuntimeError('client %s not registered' % sid)
+
+ return self._clients[sid]['cfg']
+
+
+ # --------------------------------------------------------------------------
+ #
+ def _unregister(self, arg):
+
+ sid = arg['sid']
+
+ with self._lock:
+
+ if sid not in self._clients:
+ raise RuntimeError('client %s not registered' % sid)
+
+ self._clients[sid]['term'].set()
+ self._clients[sid]['proc'].join()
+
+ del self._clients[sid]
+
+
+ # --------------------------------------------------------------------------
+ #
+ def _heartbeat(self, arg):
+
+ sid = arg['sid']
+ now = time.time()
+
+ with self._lock:
+
+ if sid not in self._clients:
+ self._log.warn('client %s not in %s', sid, self._clients)
+ return
+
+ self._clients[sid]['hb'] = now
+
+
+ # --------------------------------------------------------------------------
+ #
+ def _service_stop(self):
+
+ self.stop()
+
+
+# ------------------------------------------------------------------------------
+
diff --git a/src/radical/pilot/raptor/master.py b/src/radical/pilot/raptor/master.py
index b913e6f92f..0f9c789cc6 100644
--- a/src/radical/pilot/raptor/master.py
+++ b/src/radical/pilot/raptor/master.py
@@ -46,7 +46,7 @@ class Master(rpu.Component):
# --------------------------------------------------------------------------
#
- def __init__(self, cfg=None):
+ def __init__(self, cfg: ru.Config = None):
'''
This raptor master is expected to be hosted in the main thread of an RP
task instance. As such the normal `RP_*` environment variables are
@@ -59,6 +59,7 @@ def __init__(self, cfg=None):
cfg: session config. fallback: agent config
'''
+ self._raptor_cfg = cfg or ru.Config()
self._uid = os.environ['RP_TASK_ID']
self._pid = os.environ['RP_PILOT_ID']
self._sid = os.environ['RP_SESSION_ID']
@@ -67,6 +68,9 @@ def __init__(self, cfg=None):
self._psbox = os.environ['RP_PILOT_SANDBOX']
self._ssbox = os.environ['RP_SESSION_SANDBOX']
self._rsbox = os.environ['RP_RESOURCE_SANDBOX']
+ self._reg_addr = os.environ['RP_REGISTRY_ADDRESS']
+
+ self._reg = ru.zmq.RegistryClient(url=self._reg_addr)
self._workers = dict() # wid: worker
self._tasks = dict() # bookkeeping of submitted requests
@@ -77,20 +81,26 @@ def __init__(self, cfg=None):
self._hb_freq = 500 # check worker heartbeats every n seconds
self._hb_timeout = 1000 # consider worker dead after this many seconds
- cfg = self._get_config(cfg)
- self._session = Session(cfg=cfg, uid=cfg.sid, _primary=False)
+ self._session = Session(uid=self._sid, _reg_addr=self._reg_addr,
+ _role=Session._DEFAULT)
self._rpc_handlers = dict()
self.register_rpc_handler('stop', self.stop)
- rpu.Component.__init__(self, cfg, self._session)
+ ccfg = ru.Config(from_dict={'uid' : self._uid,
+ 'sid' : self._sid,
+ 'owner' : self._pid,
+ 'reg_addr': self._reg_addr})
+
+ rpu.Component.__init__(self, ccfg, self._session)
- self.register_publisher(rpc.STATE_PUBSUB, self._psbox)
- self.register_publisher(rpc.CONTROL_PUBSUB, self._psbox)
+ # we never run `self.start()`, which is ok - but it means we miss out on
+ # some of the component initialization, so call it manually here
+ self._initialize()
# send new worker tasks and agent input staging / agent scheduler
self.register_output(rps.AGENT_STAGING_INPUT_PENDING,
- rpc.AGENT_STAGING_INPUT_QUEUE, self._psbox)
+ rpc.AGENT_STAGING_INPUT_QUEUE)
# set up zmq queues between the agent scheduler and this master so that
# we can receive new requests from RP tasks
@@ -102,12 +112,13 @@ def __init__(self, cfg=None):
'stall_hwm' : 0,
'bulk_size' : 1})
- self._input_queue = ru.zmq.Queue(input_cfg)
+ # FIXME: how to pass cfg?
+ self._input_queue = ru.zmq.Queue(qname, cfg=input_cfg)
self._input_queue.start()
# send completed request tasks to agent output staging / tmgr
self.register_output(rps.AGENT_STAGING_OUTPUT_PENDING,
- rpc.AGENT_STAGING_OUTPUT_QUEUE, self._psbox)
+ rpc.AGENT_STAGING_OUTPUT_QUEUE)
# set up zmq queues between this master and all workers for request
# distribution and result collection
@@ -125,8 +136,8 @@ def __init__(self, cfg=None):
'stall_hwm' : 0,
'bulk_size' : 1})
- self._req_queue = ru.zmq.Queue(req_cfg)
- self._res_queue = ru.zmq.Queue(res_cfg)
+ self._req_queue = ru.zmq.Queue('raptor_tasks', cfg=req_cfg)
+ self._res_queue = ru.zmq.Queue('raptor_results', cfg=res_cfg)
self._req_queue.start()
self._res_queue.start()
@@ -164,8 +175,7 @@ def __init__(self, cfg=None):
ru.zmq.Getter(qname, self._input_queue.addr_get, cb=self._request_cb)
# everything is set up - we can serve messages on the pubsubs also
- self.register_subscriber(rpc.STATE_PUBSUB, self._state_cb, self._psbox)
- self.register_subscriber(rpc.CONTROL_PUBSUB, self._control_cb, self._psbox)
+ self.register_subscriber(rpc.STATE_PUBSUB, self._state_cb)
# and register that input queue with the scheduler
self._log.debug('registered raptor queue: %s / %s', self._uid, qname)
@@ -192,41 +202,6 @@ def register_rpc_handler(self, cmd, handler) -> None:
self._rpc_handlers[cmd] = handler
- # --------------------------------------------------------------------------
- #
- def _get_config(self, cfg=None):
- '''
- derive a worker base configuration from the control pubsub configuration
-
- Args:
- cfg (Dict[str, Any]): configuration to start from
- '''
-
- # FIXME: use registry for comm EP info exchange, not cfg files
-
- if cfg is None:
- cfg = dict()
-
- if cfg and 'path' in cfg:
- del cfg['path']
-
- ru.dict_merge(cfg, ru.read_json('%s/control_pubsub.json' % self._psbox))
-
- del cfg['channel']
- del cfg['cmgr']
-
- cfg['log_lvl'] = 'warn'
- cfg['kind'] = 'master'
- cfg['sid'] = self._sid
- cfg['path'] = self._sbox
- cfg['base'] = os.environ['RP_PILOT_SANDBOX']
-
- cfg = ru.Config(cfg=cfg)
- cfg['uid'] = self._uid
-
- return cfg
-
-
# --------------------------------------------------------------------------
#
@property
@@ -239,7 +214,7 @@ def workers(self):
# --------------------------------------------------------------------------
#
- def _control_cb(self, topic, msg):
+ def control_cb(self, topic, msg):
'''
listen for `worker_register`, `worker_unregister`,
`worker_rank_heartbeat` and `rpc_req` messages.
@@ -300,6 +275,7 @@ def _control_cb(self, topic, msg):
self._workers[uid]['status'] = self.DONE
+ # FIXME RPC
elif cmd == 'rpc_req':
if arg['tgt'] != self._uid:
@@ -425,12 +401,10 @@ def submit_workers(self, descriptions: List[TaskDescription]
List[str]: list of uids for submitted worker tasks
'''
- # FIXME registry: use registry instead of config files
-
tasks = list()
for td in descriptions:
- if not td.mode == RAPTOR_WORKER:
+ if td.mode != RAPTOR_WORKER:
raise ValueError('unexpected task mode [%s]' % td.mode)
# sharing GPUs among multiple ranks not yet supported
@@ -445,7 +419,6 @@ def submit_workers(self, descriptions: List[TaskDescription]
if not td.get('uid'):
td.uid = '%s.%s' % (self.uid, ru.generate_id('worker',
ns=self.uid))
-
if not td.get('executable'):
td.executable = 'radical-pilot-raptor-worker'
@@ -460,6 +433,11 @@ def submit_workers(self, descriptions: List[TaskDescription]
# ensure that defaults and backward compatibility kick in
td.verify()
+ # the default worker needs its own task description to derive the
+ # amount of available resources
+ self._reg['raptor.%s.cfg' % self._uid] = td.as_dict()
+ self._reg.dump('raptor_master')
+
# all workers run in the same sandbox as the master
task = dict()
diff --git a/src/radical/pilot/raptor/worker.py b/src/radical/pilot/raptor/worker.py
index fda1a99e99..e7242aefc0 100644
--- a/src/radical/pilot/raptor/worker.py
+++ b/src/radical/pilot/raptor/worker.py
@@ -33,31 +33,40 @@ def __init__(self, manager, rank, raptor_id):
self._rank = rank
self._raptor_id = raptor_id
self._reg_event = mt.Event()
+ self._reg_addr = os.environ['RP_REGISTRY_ADDRESS']
self._sbox = os.environ['RP_TASK_SANDBOX']
self._uid = os.environ['RP_TASK_ID']
+ self._sid = os.environ['RP_SESSION_ID']
self._ranks = int(os.environ['RP_RANKS'])
+ self._reg = ru.zmq.RegistryClient(url=self._reg_addr)
+
+ self._cfg = ru.Config(cfg=self._reg['cfg'])
+
self._log = ru.Logger(name=self._uid, ns='radical.pilot.worker',
- level='DEBUG', targets=['.'], path=self._sbox)
+ level=self._cfg.log_lvl,
+ debug=self._cfg.debug_lvl,
+ targets=self._cfg.log_tgt,
+ path=self._cfg.path)
self._prof = ru.Profiler(name='%s.%04d' % (self._uid, self._rank),
ns='radical.pilot.worker',
- path=self._sbox)
+ path=self._cfg.path)
# register for lifetime management messages on the control pubsub
psbox = os.environ['RP_PILOT_SANDBOX']
- state_cfg = ru.read_json('%s/%s.cfg' % (psbox, rpc.STATE_PUBSUB))
- ctrl_cfg = ru.read_json('%s/%s.cfg' % (psbox, rpc.CONTROL_PUBSUB))
+ state_cfg = self._reg['bridges.%s' % rpc.STATE_PUBSUB]
+ ctrl_cfg = self._reg['bridges.%s' % rpc.CONTROL_PUBSUB]
- ru.zmq.Subscriber(rpc.STATE_PUBSUB, url=state_cfg['sub'],
+ ru.zmq.Subscriber(rpc.STATE_PUBSUB, url=state_cfg['addr_sub'],
log=self._log, prof=self._prof, cb=self._state_cb,
topic=rpc.STATE_PUBSUB)
- ru.zmq.Subscriber(rpc.CONTROL_PUBSUB, url=ctrl_cfg['sub'],
+ ru.zmq.Subscriber(rpc.CONTROL_PUBSUB, url=ctrl_cfg['addr_sub'],
log=self._log, prof=self._prof, cb=self._control_cb,
topic=rpc.CONTROL_PUBSUB)
# we push heartbeat and registration messages on that pubsub also
self._ctrl_pub = ru.zmq.Publisher(rpc.CONTROL_PUBSUB,
- url=ctrl_cfg['pub'],
+ url=ctrl_cfg['addr_pub'],
log=self._log,
prof=self._prof)
# let ZMQ settle
@@ -137,15 +146,17 @@ def _hb_worker(self):
# --------------------------------------------------------------------------
#
- def _state_cb(self, topic, msg):
+ def _state_cb(self, topic, msgs):
- cmd = msg['cmd']
- arg = msg['arg']
+ for msg in ru.as_list(msgs):
+
+ cmd = msg['cmd']
+ arg = msg['arg']
- # general task state updates -- check if our master is affected
- if cmd == 'update':
+ if cmd != 'update':
+ continue
- for thing in ru.as_list(arg):
+ for thing in arg:
uid = thing['uid']
state = thing['state']
diff --git a/src/radical/pilot/raptor/worker_default.py b/src/radical/pilot/raptor/worker_default.py
index c4518bfb37..f102aa6e15 100644
--- a/src/radical/pilot/raptor/worker_default.py
+++ b/src/radical/pilot/raptor/worker_default.py
@@ -46,11 +46,13 @@ def __init__(self, raptor_id : str):
self._req_get = ru.zmq.Getter('request', self._req_addr_get,
cb=self._request_cb)
- self._descr = ru.read_json('%s.json' % self._uid)
+ # the master should have stored our own task description in the registry
+ self._reg.dump('raptor_worker')
+ self._descr = self._reg['raptor.%s.cfg' % self._uid]
# keep worker ID and rank
- self._n_cores = self._descr.get('cores_per_rank', 1)
- self._n_gpus = int(self._descr.get('gpus_per_rank', 0))
+ self._n_cores = int(os.environ.get('cores_per_rank', 1))
+ self._n_gpus = int(os.environ.get('gpus_per_rank', 0))
# We need to make sure to run only up to `gpn` tasks using a gpu
# within that pool, so need a separate counter for that.
diff --git a/src/radical/pilot/raptor_tasks.py b/src/radical/pilot/raptor_tasks.py
index 4610b27c66..731a8d8f89 100644
--- a/src/radical/pilot/raptor_tasks.py
+++ b/src/radical/pilot/raptor_tasks.py
@@ -99,9 +99,14 @@ def rpc(self, rpc: str,
if not self._pilot:
raise RuntimeError('not assigned to a pilot yet, cannot run rpc')
- reply = self._session._dbs.pilot_rpc(self._pilot, self.uid, rpc, args)
+ cmd = 'raptor_rpc'
- return reply
+ if not args:
+ args = dict()
+
+ args['raptor_cmd'] = rpc
+
+ return self._tmgr.pilot_rpc(self._pilot, cmd, args)
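+
+ # illustrative call path (sketch only): `raptor_task.rpc('stop')` thus
+ # becomes
+ #
+ #     tmgr.pilot_rpc(pilot, 'raptor_rpc', {'raptor_cmd': 'stop'})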
# ------------------------------------------------------------------------------
diff --git a/src/radical/pilot/resource_description.py b/src/radical/pilot/resource_description.py
new file mode 100644
index 0000000000..8f36893229
--- /dev/null
+++ b/src/radical/pilot/resource_description.py
@@ -0,0 +1,148 @@
+
+__copyright__ = 'Copyright 2013-2021, The RADICAL-Cybertools Team'
+__license__ = 'MIT'
+
+import radical.utils as ru
+
+DESCRIPTION = 'description'
+NOTES = 'notes'
+DEFAULT_SCHEMA = 'default_schema'
+SCHEMAS = 'schemas'
+JOB_MANAGER_ENDPOINT = 'job_manager_endpoint'
+JOB_MANAGER_HOP = 'job_manager_hop'
+FILESYSTEM_ENDPOINT = 'filesystem_endpoint'
+DEFAULT_REMOTE_WORKDIR = 'default_remote_workdir'
+DEFAULT_QUEUE = 'default_queue'
+RESOURCE_MANAGER = 'resource_manager'
+AGENT_CONFIG = 'agent_config'
+AGENT_SCHEDULER = 'agent_scheduler'
+AGENT_SPAWNER = 'agent_spawner'
+PRE_BOOTSTRAP_0 = 'pre_bootstrap_0'
+PRE_BOOTSTRAP_1 = 'pre_bootstrap_1'
+RP_VERSION = 'rp_version'
+VIRTENV = 'virtenv'
+VIRTENV_MODE = 'virtenv_mode'
+VIRTENV_DIST = 'virtenv_dist'
+PYTHON_DIST = 'python_dist'
+LAUNCH_METHODS = 'launch_methods'
+LFS_PATH_PER_NODE = 'lfs_path_per_node'
+LFS_SIZE_PER_NODE = 'lfs_size_per_node'
+MEM_PER_NODE = 'mem_per_node'
+CORES_PER_NODE = 'cores_per_node'
+GPUS_PER_NODE = 'gpus_per_node'
+SYSTEM_ARCHITECTURE = 'system_architecture'
+
+FAKE_RESOURCES = 'fake_resources'
+MANDATORY_ARGS = 'mandatory_args'
+FORWARD_TUNNEL_ENDPOINT = 'forward_tunnel_endpoint'
+
+NEW_SESSION_PER_TASK = 'new_session_per_task'
+TASK_PRE_EXEC = 'task_pre_exec'
+
+
+# ------------------------------------------------------------------------------
+#
+class AccessSchema(ru.TypedDict):
+
+ _schema = {
+ JOB_MANAGER_ENDPOINT: str,
+ JOB_MANAGER_HOP : str,
+ FILESYSTEM_ENDPOINT : str,
+ }
+
+ _defaults = {
+ JOB_MANAGER_ENDPOINT: None,
+ JOB_MANAGER_HOP : None,
+ FILESYSTEM_ENDPOINT : None,
+ }
+
+
+# ------------------------------------------------------------------------------
+#
+class ResourceDescription(ru.TypedDict):
+ '''
+ Schema-based description of a resource configuration entry.
+ '''
+
+ _schema = {
+ DESCRIPTION : str ,
+ NOTES : str ,
+ DEFAULT_SCHEMA : str ,
+ SCHEMAS : {str: AccessSchema},
+
+ # FIXME: AM - need to resolve since in Session it is moved into RD
+ # `_get_resource_sandbox` -> `KeyError: 'filesystem_endpoint'`
+ JOB_MANAGER_ENDPOINT : str ,
+ JOB_MANAGER_HOP : str ,
+ FILESYSTEM_ENDPOINT : str ,
+
+ DEFAULT_REMOTE_WORKDIR : str ,
+ DEFAULT_QUEUE : str ,
+ RESOURCE_MANAGER : str ,
+ AGENT_CONFIG : str ,
+ AGENT_SCHEDULER : str ,
+ AGENT_SPAWNER : str ,
+ PRE_BOOTSTRAP_0 : [str] ,
+ PRE_BOOTSTRAP_1 : [str] ,
+ RP_VERSION : str ,
+ VIRTENV : str ,
+ VIRTENV_MODE : str ,
+ VIRTENV_DIST : str ,
+ PYTHON_DIST : str ,
+ LAUNCH_METHODS : {str: None},
+ LFS_PATH_PER_NODE : str ,
+ LFS_SIZE_PER_NODE : str ,
+ MEM_PER_NODE : int ,
+ CORES_PER_NODE : int ,
+ GPUS_PER_NODE : int ,
+ SYSTEM_ARCHITECTURE : {str: None},
+
+ FAKE_RESOURCES : bool ,
+ MANDATORY_ARGS : [str] ,
+ FORWARD_TUNNEL_ENDPOINT: str ,
+ NEW_SESSION_PER_TASK : bool ,
+ TASK_PRE_EXEC : [str] ,
+ }
+
+ _defaults = {
+ DESCRIPTION : '' ,
+ NOTES : '' ,
+ DEFAULT_SCHEMA : '' ,
+ SCHEMAS : {} ,
+
+ # FIXME: AM - need to resolve since in Session it is moved into RD
+ # `_get_resource_sandbox` -> `KeyError: 'filesystem_endpoint'`
+ JOB_MANAGER_ENDPOINT : None ,
+ JOB_MANAGER_HOP : None ,
+ FILESYSTEM_ENDPOINT : None ,
+
+ DEFAULT_REMOTE_WORKDIR : '' ,
+ DEFAULT_QUEUE : '' ,
+ RESOURCE_MANAGER : '' ,
+ AGENT_CONFIG : 'default' ,
+ AGENT_SCHEDULER : 'CONTINUOUS',
+ AGENT_SPAWNER : 'POPEN' ,
+ PRE_BOOTSTRAP_0 : [] ,
+ PRE_BOOTSTRAP_1 : [] ,
+ RP_VERSION : '' ,
+ VIRTENV : '' ,
+ VIRTENV_MODE : '' ,
+ VIRTENV_DIST : '' ,
+ PYTHON_DIST : 'default' ,
+ LAUNCH_METHODS : {} ,
+ LFS_PATH_PER_NODE : '' ,
+ LFS_SIZE_PER_NODE : '' ,
+ MEM_PER_NODE : 0 ,
+ CORES_PER_NODE : 0 ,
+ GPUS_PER_NODE : 0 ,
+ SYSTEM_ARCHITECTURE : {} ,
+
+ FAKE_RESOURCES : False ,
+ MANDATORY_ARGS : [] ,
+ FORWARD_TUNNEL_ENDPOINT: '' ,
+ NEW_SESSION_PER_TASK : True ,
+ TASK_PRE_EXEC : [] ,
+ }
+
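+# An illustrative (hypothetical) use: construct a description from a plain
+# dict and validate it against the schema above, as the session does for
+# each access schema of every resource config:
+#
+#     rd = ResourceDescription(from_dict={'resource_manager': 'SLURM',
+#                                         'cores_per_node'  : 64})
+#     rd.verify()
+#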
+
+# ------------------------------------------------------------------------------
diff --git a/src/radical/pilot/session.py b/src/radical/pilot/session.py
index f344d07f18..8b898f9b9e 100644
--- a/src/radical/pilot/session.py
+++ b/src/radical/pilot/session.py
@@ -3,16 +3,52 @@
__license__ = "MIT"
import os
-import sys
import copy
+import time
+
+from typing import Optional
+
+import threading as mt
import radical.utils as ru
import radical.saga as rs
import radical.saga.filesystem as rsfs
import radical.saga.utils.pty_shell as rsup
-from .db import DBSession
-from . import utils as rpu
+from . import constants as rpc
+from . import utils as rpu
+
+from .messages import HeartbeatMessage
+from .proxy import Proxy
+from .resource_description import ResourceDescription
+
+
+# ------------------------------------------------------------------------------
+#
+class _CloseOptions(ru.TypedDict):
+ """Options and validation for Session.close().
+
+ Arguments:
+ download (bool, optional): Fetch pilot profiles and database entries.
+ (Default False.)
+ terminate (bool, optional): Shut down all pilots associated with the
+ session. (Default True.)
+
+ """
+
+ _check = True
+
+ _schema = {
+ 'download' : bool,
+ 'terminate': bool,
+ 'cleanup' : bool # FIXME: to be removed
+ }
+
+ _defaults = {
+ 'download' : False,
+ 'terminate': True,
+ 'cleanup' : True # FIXME: to be removed
+ }
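+
+# a sketch of how these options are used - either passed to `close()`
+# directly:
+#
+#     session.close(terminate=True, download=False)
+#
+# or given to the `Session` constructor as defaults, picked up when the
+# session is used as a context manager:
+#
+#     with rp.Session(download=True) as session:
+#         ...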
# ------------------------------------------------------------------------------
@@ -38,90 +74,341 @@ class Session(rs.Session):
# sessions of RP client or agent modules), but all components need to call
# the sessions `heartbeat()` method at regular intervals.
- # the reporter is an applicataion-level singleton
+ # the reporter is an application-level singleton
_reporter = None
+ # a session has one of four possible roles:
+ # - primary: the session is the first explicit session instance created in
+ # an RP application.
+ # - agent_0: the session is the first session instance created in an RP
+ # agent.
+ # - agent_n: the session lives in a sub-agent.
+ # - default: any other session instance, for example those created by
+ # components in the client or agent module.
+ _PRIMARY = 'primary'
+ _AGENT_0 = 'agent_0'
+ _AGENT_N = 'agent_n'
+ _DEFAULT = 'default'
+
+
# --------------------------------------------------------------------------
#
- def __init__(self, dburl=None, uid=None, cfg=None, _primary=True,
- **close_options):
+ def __init__(self, proxy_url: Optional[str ] = None,
+ uid : Optional[str ] = None,
+ cfg : Optional[dict] = None,
+ _role : Optional[str ] = _PRIMARY,
+ _reg_addr: Optional[str ] = None,
+ **close_options):
"""Create a new session.
A new Session instance is created and stored in the database.
+ Any RP Session will require an RP Proxy to facilitate communication
+ between the client machine (i.e., the host where the application created
+ this Session instance) and the target resource (i.e., the host where the
+ pilot agent/s is/are running and where the workload is being executed).
+
+ A `proxy_url` can be specified which then must point to an RP Proxy
+ Service instance which this session can use to establish a communication
+ proxy. If `proxy_url` is not specified, the session will check the
+ environment variable `RADICAL_PILOT_PROXY_URL` and interpret it the same
+ way. If that information is not available either, the session will
+ instantiate a proxy service on the local host. Note that any proxy
+ service instantiated by the session itself will be terminated once the
+ session instance is closed or goes out of scope and is garbage
+ collected; it should thus not be used by other session instances.
+
+ Note: an RP proxy will have to be accessible by both the client and the
+ target hosts to facilitate communication between both parties.
+ That implies access to the respective ports. Proxies started by
+ the session itself will use the first free port at or above
+ 10000.
+
Arguments:
- dburl (str, optional):: The MongoDB URL. If none is given,
- RP uses the environment variable RADICAL_PILOT_DBURL. If that is
- not set, an error will be raised.
+
+ proxy_url (str, optional): proxy service URL - points to an RP
+ proxy service which is used to establish an RP communication proxy
+ for this session.
+
+ uid (str, optional): Create a session with this UID. Session UIDs
+ MUST be unique - otherwise they will lead to communication
+ conflicts, resulting in undefined behaviours.
+
cfg (str | dict, optional): a named or instantiated configuration
to be used for the session.
- uid (str, optional): Create a session with this UID. Session UIDs
- MUST be unique - otherwise they will lead to conflicts in the
- underlying database, resulting in undefined behaviours (or worse).
- _primary (bool, optional): only sessions created by the original
- application process (via `rp.Session()`, will connect to the DB.
- Secondary session instances are instantiated internally in
- processes spawned (directly or indirectly) by the initial session,
- for example in some of it's components. A secondary session will
- inherit the original session ID, but will not attempt to create
- a new DB collection - if such a DB connection is needed, the
- component needs to establish that on its own.
- **close_options: If additional key word arguments are provided, they
- will be used as the default arguments to Session.close(). (This
- can be useful when the Session is used as a Python context
- manager, such that close() is called automatically at the end of
- a ``with`` block.)
+ _role (`str`): only `PRIMARY` sessions created by the original
+ application process (via `rp.Session()`) will create proxies
+ and Registry Services. `AGENT` sessions will also create
+ a Registry but no proxies. All other `DEFAULT` session
+ instances are instantiated internally in processes spawned
+ (directly or indirectly) by the initial session, for example in
+ some of its components, or by the RP agent. Those sessions
+ will inherit the original session ID, but will not attempt to
+ create new proxies or registries.
+
+ **close_options (optional): If additional key word arguments are
+ provided, they will be used as the default arguments to
+ Session.close(). This can be useful when the Session is used as
+ a Python context manager, such that close() is called
+ automatically at the end of a ``with`` block.
+
+ _reg_addr (str, optional): Non-primary sessions will connect to the
+ registry at that endpoint and pull session config and resource
+ configurations from there.
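+
+ Example (URLs and hostnames are illustrative only):
+
+     # use an externally running proxy service
+     session = rp.Session(proxy_url='tcp://proxy.host:10000')
+
+     # or let the session start (and on close terminate) a local proxy
+     session = rp.Session()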
"""
+ self._t_start = time.time()
+
+ self._role = _role
+ self._uid = uid
+ self._cfg = ru.Config(cfg=cfg)
+ self._reg_addr = _reg_addr
+ self._proxy_url = proxy_url
+ self._proxy_cfg = None
+ self._closed = False
+ self._created = time.time()
self._close_options = _CloseOptions(close_options)
- # NOTE: `name` and `cfg` are overloaded, the user cannot point to
+ self._close_options.verify()
+
+ self._proxy = None # proxy client instance
+ self._reg = None # registry client instance
+ self._pmgrs = dict() # map IDs to pmgr instances
+ self._tmgrs = dict() # map IDs to tmgr instances
+ self._cmgr = None # only primary sessions have a cmgr
+
+ # this session either lives in the client application or in the
+ # scope of a pilot. In the latter case we expect `RP_PILOT_ID` to be
+ # set - we derive the session module scope from that env variable.
+ self._module = os.environ.get('RP_PILOT_ID', 'client')
+
+ # non-primary sessions need a uid!
+ if self._role != self._PRIMARY and not self._uid:
+ raise ValueError('non-primary session needs UID (%s)' % self._role)
+
+ # initialization is different for each session type
+ # NOTE: we could refactor this to session sub-classes
+ if self._role == self._PRIMARY: self._init_primary()
+ elif self._role == self._AGENT_0: self._init_agent_0()
+ elif self._role == self._AGENT_N: self._init_agent_n()
+ else : self._init_default()
+
+ # now we have config and uid - initialize base class (saga session)
+ rs.Session.__init__(self, uid=self._uid)
+
+ # cache sandboxes etc.
+ self._cache_lock = ru.RLock()
+ self._cache = {'endpoint_fs' : dict(),
+ 'resource_sandbox' : dict(),
+ 'session_sandbox' : dict(),
+ 'pilot_sandbox' : dict(),
+ 'client_sandbox' : self._cfg.client_sandbox,
+ 'js_shells' : dict(),
+ 'fs_dirs' : dict()}
+
+ # at this point we have a bridge connection, logger, etc, and are done
+ self._prof.prof('session_ok', uid=self._uid)
+
+ if self._role == self._PRIMARY:
+ self._rep.ok('>>ok\n')
+
+ assert self._reg
+
+
+ # --------------------------------------------------------------------------
+ #
+ def _init_primary(self):
+
+ # The primary session
+ # - reads session config files
+ # - reads resource config files
+ # - starts the client side registry service
+ # - pushes the configs into that registry
+ # - pushes bridge and component configs into that registry
+ # - starts a ZMQ proxy (or ensures one is up and running)
+
+ # if user did not set a uid, we need to generate a new ID
+ if not self._uid:
+ self._uid = ru.generate_id('rp.session', mode=ru.ID_PRIVATE)
+
+ # we still call `_init_cfg_from_scratch` to complete missing config settings
+ # FIXME: completion only needed by `PRIMARY`
+ self._init_cfg_from_scratch()
+
+ # primary sessions create a registry service
+ self._start_registry()
+ self._connect_registry()
+
+ # only primary sessions start and initialize the proxy service
+ self._start_proxy()
+
+ # start heartbeat channel
+ self._start_heartbeat()
+
+ # push the session config into the registry
+ self._publish_cfg()
+
+ # start bridges and components
+ self._start_components()
+
+ # primary session hooks into the control pubsub
+ bcfg = self._reg['bridges.%s' % rpc.CONTROL_PUBSUB]
+ self._ctrl_pub = ru.zmq.Publisher(channel=rpc.CONTROL_PUBSUB,
+ url=bcfg['addr_pub'],
+ log=self._log,
+ prof=self._prof)
+
+ # crosswire local channels and proxy channels
+ self._crosswire_proxy()
+
+
+ # --------------------------------------------------------------------------
+ #
+ def _init_agent_0(self):
+
+ # The agent_0 session expects the `cfg` parameter to contain the
+ # complete agent config!
+ #
+ # - starts the agent side registry service
+ # - separates
+ # - session config (== agent config)
+ # - bridge configs
+ # - component configs
+ # - resource config
+ # - pushes them all into the registry
+ # - connects to the ZMQ proxy for client/agent communication
+ # - start agent components
+
+ self._init_cfg_from_dict()
+ self._start_registry()
+ self._connect_registry()
+ self._connect_proxy()
+ self._start_heartbeat()
+ self._publish_cfg()
+ self._start_components()
+ self._crosswire_proxy()
+
+
+ # --------------------------------------------------------------------------
+ #
+ def _init_agent_n(self):
+
+ # Agent_n sessions fetch their config from the agent_0 registry
+ #
+ # - connect to registry
+ # - fetch config from registry
+ # - start agent components
+
+ self._connect_registry()
+ self._init_cfg_from_registry()
+ self._start_components()
+
+
+ # --------------------------------------------------------------------------
+ #
+ def _init_default(self):
+
+ # sub-agents and components connect to an existing registry (owned by
+ # the `primary` session or `agent_0`) and load config settings from
+ # there.
+
+ self._connect_registry()
+ self._init_cfg_from_registry()
+
+
+ # --------------------------------------------------------------------------
+ #
+ def _start_registry(self):
+
+ # make sure that no other registry is used
+ if self._reg_addr:
+ raise ValueError('cannot start registry when providing `reg_addr`')
+
+ self._reg_service = ru.zmq.Registry(uid='%s.reg' % self._uid,
+ path=self._cfg.path)
+ self._reg_service.start()
+
+ self._cfg.reg_addr = self._reg_service.addr
+
+
+ # --------------------------------------------------------------------------
+ #
+ def _connect_registry(self):
+
+ if not self._cfg.reg_addr:
+ self._cfg.reg_addr = self._reg_addr
+
+ if not self._cfg.reg_addr:
+ raise ValueError('session needs a registry address')
+
+ self._reg = ru.zmq.RegistryClient(url=self._cfg.reg_addr)
+
+
+ # --------------------------------------------------------------------------
+ #
+ def _init_cfg_from_scratch(self):
+
+ # A primary session will at this point have a registry client connected
+ # to its registry service. Further, self._cfg will either be a config
+ # name to be read from disk (`session_.json`), or a dictionary
+ # with a specific, user provided config. From this information clean up
+ # `self._cfg` and store it in the registry. Also read resource configs
+ # and store them in the registry as well.
+
+ # NOTE: `cfg_name` and `cfg` are overloaded, the user cannot point to
# a predefined config and amend it at the same time. This might
- # be ok for the session, but introduces a minor API inconsistency.
- name = 'default'
- if isinstance(cfg, str):
- name = cfg
- cfg = None
+ # be ok for the session, but introduces an API inconsistency.
- self._dbs = None
- self._closed = False
- self._primary = _primary
+ cfg_name = 'default'
+ if isinstance(self._cfg, str):
+ cfg_name = self._cfg
+ self._cfg = None
- self._pmgrs = dict() # map IDs to pmgr instances
- self._tmgrs = dict() # map IDs to tmgr instances
- self._cmgr = None # only primary sessions have a cmgr
+ # load the named config, merge provided config
+ self._cfg = ru.Config('radical.pilot.session', name=cfg_name,
+ cfg=self._cfg)
- self._cfg = ru.Config('radical.pilot.session', name=name, cfg=cfg)
- self._rcfgs = ru.Config('radical.pilot.resource', name='*', expand=False)
+ rcfgs = ru.Config('radical.pilot.resource', name='*', expand=False)
+ rcfgs_ext = {}
- for site in self._rcfgs:
- for rcfg in self._rcfgs[site].values():
- for schema in rcfg.get('schemas', []):
- while isinstance(rcfg.get(schema), str):
- tgt = rcfg[schema]
- rcfg[schema] = rcfg[tgt]
+ for site in rcfgs:
+ rcfgs_ext[site] = {}
+ for res, rcfg in rcfgs[site].items():
+ rcfgs_ext[site][res] = {
+ 'default_schema': rcfg['default_schema'],
+ 'schemas' : rcfg.get('schemas', {})
+ }
+ for schema in rcfg.get('schemas', {}):
+ while isinstance(rcfg['schemas'][schema], str):
+ tgt = rcfg['schemas'][schema]
+ rcfg['schemas'][schema] = rcfg['schemas'][tgt]
+ for schema in rcfg.get('schemas', {}):
+ rcfgs_ext[site][res][schema] = rcfgs[site][res].as_dict()
+ ru.dict_merge(rcfgs_ext[site][res][schema],
+ rcfgs[site][res]['schemas'][schema])
+ del rcfgs_ext[site][res][schema]['default_schema']
- if _primary:
+ for site in rcfgs_ext:
+ for res, rcfg in rcfgs_ext[site].items():
+ for schema in rcfg.get('schemas', {}):
+ rd = ResourceDescription(from_dict=rcfg[schema])
+ rd.verify()
- pwd = os.getcwd()
+ self._rcfgs = ru.Config(from_dict=rcfgs_ext)
+ self._rcfg = ru.Config() # the local resource config, if known
- if not self._cfg.sid:
- if uid:
- self._cfg.sid = uid
- else:
- self._cfg.sid = ru.generate_id('rp.session',
- mode=ru.ID_PRIVATE)
- if not self._cfg.base:
- self._cfg.base = pwd
+ # set essential config values for *this* specific session
+ self._cfg['sid'] = self._uid
+
+ pwd = os.getcwd()
+
+ if not self._cfg.base:
+ self._cfg.base = pwd
- if not self._cfg.path:
- self._cfg.path = '%s/%s' % (self._cfg.base, self._cfg.sid)
+ if not self._cfg.path:
+ self._cfg.path = '%s/%s' % (self._cfg.base, self._cfg.sid)
- if not self._cfg.client_sandbox:
- self._cfg.client_sandbox = pwd
+ if not self._cfg.client_sandbox:
+ self._cfg.client_sandbox = pwd
- else:
- for k in ['sid', 'base', 'path']:
- assert k in self._cfg, 'non-primary session misses %s' % k
# change RU defaults to point logfiles etc. to the session sandbox
def_cfg = ru.DefaultConfig()
@@ -129,103 +416,397 @@ def __init__(self, dburl=None, uid=None, cfg=None, _primary=True,
def_cfg.report_dir = self._cfg.path
def_cfg.profile_dir = self._cfg.path
- self._uid = self._cfg.sid
+ self._prof = self._get_profiler(name=self._uid)
+ self._rep = self._get_reporter(name=self._uid)
+ self._log = self._get_logger (name=self._uid,
+ level=self._cfg.get('log_lvl'),
+ debug=self._cfg.get('debug_lvl'))
+
+ from . import version_detail as rp_version_detail
+ self._log.info('radical.pilot version: %s', rp_version_detail)
+ self._log.info('radical.saga version: %s', rs.version_detail)
+ self._log.info('radical.utils version: %s', ru.version_detail)
+
+ self._prof.prof('session_start', uid=self._uid)
+
+ self._rep.info ('<<new session: ')
- self._rep.error(">>err\n")
- self._log.exception('session create failed [%s]', dburl_no_passwd)
- raise RuntimeError ('session create failed [%s]' %
- dburl_no_passwd) from e
+ def pubsub_fwd(topic, msg):
- # primary sessions have a component manager which also manages
- # heartbeat. 'self._cmgr.close()` should be called during termination
- import pprint
- self._log.debug('cmgr cfg: %s', pprint.pformat(self._cfg))
- self._cmgr = rpu.ComponentManager(self._cfg)
- self._cmgr.start_bridges()
- self._cmgr.start_components()
+ if 'origin' not in msg:
+ msg['origin'] = self._module
- # expose the cmgr's heartbeat channel to anyone who wants to use it
- self._cfg.heartbeat = self._cmgr.cfg.heartbeat # pylint: disable=E1101
+ if from_proxy:
- self._rec = False
- if self._cfg.record:
+ # all messages *from* the proxy are forwarded - but not the ones
+ # which originated in *this* module in the first place.
- # append session ID to recording path
- self._rec = "%s/%s" % (self._rec, self._uid)
+ if msg['origin'] == self._module:
+ # self._log.debug('XXX >=! fwd %s to topic:%s: %s', src, tgt, msg)
+ return
- # create recording path and record session
- os.system('mkdir -p %s' % self._rec)
- ru.write_json({'dburl': str(self.dburl)},
- "%s/session.json" % self._rec)
- self._log.info("recording session in %s", self._rec)
+ # self._log.debug('XXX >=> fwd %s to topic:%s: %s', src, tgt, msg)
+ publisher.put(tgt, msg)
- self._rep.ok('>>ok\n')
+ else:
+
+ # only forward messages which have the respective flag set
+ if not msg.get('fwd'):
+ # self._log.debug('XXX =>! fwd %s to %s: %s [%s - %s]', src,
+ # tgt, msg, msg['origin'], self._module)
+ return
+
+ # avoid message loops (forward only once)
+ msg['fwd'] = False
+
+ # only forward messages which originated in *this* module.
+
+ if not msg['origin'] == self._module:
+ # self._log.debug('XXX =>| fwd %s to topic:%s: %s', src, tgt, msg)
+ return
+
+ # self._log.debug('XXX =>> fwd %s to topic:%s: %s', src, tgt, msg)
+ publisher.put(tgt, msg)
+
+
+ ru.zmq.Subscriber(channel=src, topic=src, path=path, cb=pubsub_fwd,
+ url=url_sub, log=self._log, prof=self._prof)
+
+
+ # --------------------------------------------------------------------------
+ #
+ def _crosswire_proxy(self):
+
+ # - forward local ctrl messages to control proxy
+ # - forward local state updates to state proxy
+ # - forward local task queue to proxy task queue
+ #
+ # - forward proxy ctrl messages to local control pubsub
+ # - forward proxy state updates to local state pubsub
+ # - forward proxy task queue messages to local task queue
+ #
+ # The local task queue endpoints differ for primary session and agent_0
+ #
+ # - primary:
+ # - forward from AGENT_STAGING_INPUT_PENDING_QUEUE
+ # - forward to TMGR_STAGING_OUTPUT_PENDING_QUEUE
+ #
+ # - agent_0:
+ # - forward to AGENT_STAGING_INPUT_PENDING_QUEUE
+ # - forward from TMGR_STAGING_OUTPUT_PENDING_QUEUE
+ #
+ # NOTE: the primary session task queues don't live in the session itself
+ # but are owned by the task manager instead - it will trigger the
+ # crosswire once the queues are created.
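+ #
+ # In summary, `pubsub_fwd` (see `crosswire_pubsub` above) applies these
+ # rules - a restatement of the logic there, not additional behavior:
+ #
+ #   local -> proxy: forward only messages which carry the 'fwd' flag and
+ #                   which originated in this module
+ #   proxy -> local: forward only messages which did *not* originate in
+ #                   this module (this avoids message loops)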
+
+ assert self._role in [self._PRIMARY, self._AGENT_0]
+
+ self.crosswire_pubsub(src=rpc.CONTROL_PUBSUB,
+ tgt=rpc.PROXY_CONTROL_PUBSUB,
+ from_proxy=False)
+ self.crosswire_pubsub(src=rpc.PROXY_CONTROL_PUBSUB,
+ tgt=rpc.CONTROL_PUBSUB,
+ from_proxy=True)
+
+ self.crosswire_pubsub(src=rpc.STATE_PUBSUB,
+ tgt=rpc.PROXY_STATE_PUBSUB,
+ from_proxy=False)
+ self.crosswire_pubsub(src=rpc.PROXY_STATE_PUBSUB,
+ tgt=rpc.STATE_PUBSUB,
+ from_proxy=True)
+
+
+ # --------------------------------------------------------------------------
+ #
+ def _start_components(self):
+
+ assert self._role in [self._PRIMARY, self._AGENT_0, self._AGENT_N]
+
+ # primary sessions and agents have a component manager which also
+ # manages heartbeat. 'self._cmgr.close()` should be called during
+ # termination
+ self._cmgr = rpu.ComponentManager(self.uid, self.reg_addr, self._uid)
+ self._cmgr.start_bridges(self._cfg.bridges)
+ self._cmgr.start_components(self._cfg.components)
# --------------------------------------------------------------------------
@@ -243,22 +824,24 @@ def __exit__(self, exc_type, exc_value, traceback):
def close(self, **kwargs):
"""Close the session.
- All subsequent attempts access objects attached to
- the session will result in an error. If cleanup is set to True,
- the session data is removed from the database.
+ All subsequent attempts to access objects attached to the session
+ will result in an error.
Arguments:
- cleanup (bool, optional): Remove session from MongoDB (implies *terminate=True*)
- terminate (bool, optional): Shut down all pilots associated with the session.
- download (bool, optional): Fetch pilot profiles and database entries.
-
+ terminate (bool, optional): Shut down all pilots associated with the
+ session.
+ download (bool, optional): Fetch pilot profiles and database
+ entries.
"""
# close only once
if self._closed:
return
- self._rep.info('closing session %s' % self._uid)
+ if self._role == self._PRIMARY:
+ self._rep.info('closing session %s' % self._uid)
+
self._log.debug("session %s closing", self._uid)
self._prof.prof("session_close", uid=self._uid)
@@ -271,6 +854,12 @@ def close(self, **kwargs):
options = self._close_options
+ if options.terminate:
+ # terminate all components
+ if self._role == self._PRIMARY:
+ self._ctrl_pub.put(rpc.CONTROL_PUBSUB, {'cmd': 'terminate',
+ 'arg': None})
+
for tmgr_uid, tmgr in self._tmgrs.items():
self._log.debug("session %s closes tmgr %s", self._uid, tmgr_uid)
tmgr.close()
@@ -284,11 +873,24 @@ def close(self, **kwargs):
if self._cmgr:
self._cmgr.close()
- if self._dbs:
- self._log.debug("session %s closes db (%s)", self._uid, options.cleanup)
- self._dbs.close(delete=options.cleanup)
+ # stop heartbeats
+ self._hb.stop()
+ self._hb_pubsub.stop()
+
+ if self._proxy:
- self._log.debug("session %s closed (delete=%s)", self._uid, options.cleanup)
+ if self._role == self._PRIMARY:
+ try:
+ self._log.debug('session %s closes service', self._uid)
+ self._proxy.request('unregister', {'sid': self._uid})
+ except:
+ pass
+
+ if self._role in [self._PRIMARY, self._AGENT_0]:
+ self._proxy.close()
+ self._proxy = None
+
+ self._log.debug("session %s closed", self._uid)
self._prof.prof("session_stop", uid=self._uid)
self._prof.close()
@@ -301,16 +903,43 @@ def close(self, **kwargs):
self._prof.prof("session_fetch_start", uid=self._uid)
self._log.debug('start download')
tgt = self._cfg.base
- self.fetch_json (tgt=tgt)
+ # # FIXME: MongoDB
+ # self.fetch_json (tgt='%s/%s' % (tgt, self.uid))
self.fetch_profiles(tgt=tgt)
self.fetch_logfiles(tgt=tgt)
self._prof.prof("session_fetch_stop", uid=self._uid)
- if self.closed and self.created:
- self._rep.info('<<session lifetime: %.1fs' % (self.closed - self.created))
- self._rep.ok('>>ok\n')
+ if self._role == self._PRIMARY:
+
+ # stop registry
+ self._reg.close()
+ self._reg_service.stop() # this will dump registry
+
+ self._t_stop = time.time()
+ self._rep.info('<<session lifetime: %.1fs' % (self._t_stop - self._t_start))
+ self._rep.ok('>>ok\n')
+
+
+ # --------------------------------------------------------------------------
+ #
+ def _run_proxy(self):
+
+ proxy = Proxy(path=self._cfg.path)
+
+ try:
+ proxy.start()
+
+ self._proxy_url = proxy.addr
+ self._proxy_event.set()
+
+ # run forever until process is interrupted or killed
+ proxy.wait()
+
+ finally:
+ proxy.stop()
+ proxy.wait()
# --------------------------------------------------------------------------
@@ -319,21 +948,18 @@ def as_dict(self):
"""Returns a Python dictionary representation of the object."""
object_dict = {
- "uid" : self._uid,
- "created" : self.created,
- "connected" : self.connected,
- "closed" : self.closed,
- "dburl" : str(self.dburl),
- "cfg" : copy.deepcopy(self._cfg)
+ 'uid' : self._uid,
+ 'proxy_url': str(self.proxy_url),
+ 'cfg' : copy.deepcopy(self._cfg)
}
return object_dict
# --------------------------------------------------------------------------
#
- def __str__(self):
- """Returns a string representation of the object."""
- return str(self.as_dict())
+ @property
+ def reg_addr(self):
+ return self._cfg.reg_addr
# --------------------------------------------------------------------------
@@ -343,13 +969,6 @@ def uid(self):
return self._uid
- # --------------------------------------------------------------------------
- #
- @property
- def base(self):
- return self._cfg.base
-
-
# --------------------------------------------------------------------------
#
@property
@@ -360,23 +979,15 @@ def path(self):
# --------------------------------------------------------------------------
#
@property
- def dburl(self):
- return self._cfg.dburl
-
-
- # --------------------------------------------------------------------------
- #
- def get_db(self):
-
- if self._dbs: return self._dbs.get_db()
- else : return None
+ def base(self):
+ return self._cfg.base
# --------------------------------------------------------------------------
#
@property
- def primary(self):
- return self._primary
+ def proxy_url(self):
+ return self._cfg.proxy_url
# --------------------------------------------------------------------------
@@ -389,70 +1000,36 @@ def cfg(self):
# --------------------------------------------------------------------------
#
@property
- def cmgr(self):
- assert self._primary
- return self._cmgr
+ def rcfgs(self):
+ return self._rcfgs
# --------------------------------------------------------------------------
#
@property
- def created(self):
- """float: The UTC date and time the session was created."""
- if self._dbs: ret = self._dbs.created
- else : ret = None
-
- if ret:
- return float(ret)
-
-
- # --------------------------------------------------------------------------
- #
- @property
- def connected(self):
- '''
- Return time when the session connected to the DB
- '''
-
- if self._dbs: ret = self._dbs.connected
- else : ret = None
-
- if ret:
- return float(ret)
+ def rcfg(self):
+ return self._rcfg
# --------------------------------------------------------------------------
#
@property
- def is_connected(self):
-
- return self._dbs.is_connected
-
-
- # --------------------------------------------------------------------------
- #
- @property
- def closed(self):
- '''
- Returns the time of closing
- '''
- if self._dbs: ret = self._dbs.closed
- else : ret = None
-
- if ret:
- return float(ret)
+ def cmgr(self):
+ return self._cmgr
# --------------------------------------------------------------------------
#
- def _get_logger(self, name, level=None):
+ def _get_logger(self, name, level=None, debug=None):
"""Get the Logger instance.
This is a thin wrapper around `ru.Logger()` which makes sure that
log files end up in a separate directory with the name of `session.uid`.
"""
- return ru.Logger(name=name, ns='radical.pilot', path=self._cfg.path,
- targets=['.'], level=level)
+ log = ru.Logger(name=name, ns='radical.pilot', path=self._cfg.path,
+ targets=['.'], level=level, debug=debug)
+
+ return log
# --------------------------------------------------------------------------
@@ -495,30 +1072,30 @@ def inject_metadata(self, metadata):
if not isinstance(metadata, dict):
raise Exception("Session metadata should be a dict!")
- if self._dbs and self._dbs._c:
- self._dbs._c.update({'type' : 'session',
- "uid" : self.uid},
- {"$push" : {"metadata": metadata}})
+ # FIXME MONGODB: to json
+ # if self._dbs and self._dbs._c:
+ # self._dbs._c.update({'type' : 'session',
+ # "uid" : self.uid},
+ # {"$push" : {"metadata": metadata}})
# --------------------------------------------------------------------------
#
def _register_pmgr(self, pmgr):
- self._dbs.insert_pmgr(pmgr.as_dict())
- self._pmgrs[pmgr.uid] = pmgr
-
-
- # --------------------------------------------------------------------------
- #
- def _reconnect_pmgr(self, pmgr):
-
- if not self._dbs.get_pmgrs(pmgr_ids=pmgr.uid):
- raise ValueError('could not reconnect to pmgr %s' % pmgr.uid)
-
self._pmgrs[pmgr.uid] = pmgr
+ # # --------------------------------------------------------------------------
+ # #
+ # def _reconnect_pmgr(self, pmgr):
+ #
+ # if not self._dbs.get_pmgrs(pmgr_ids=pmgr.uid):
+ # raise ValueError('could not reconnect to pmgr %s' % pmgr.uid)
+ #
+ # self._pmgrs[pmgr.uid] = pmgr
+ #
+ #
# --------------------------------------------------------------------------
#
def list_pilot_managers(self):
@@ -541,10 +1118,12 @@ def get_pilot_managers(self, pmgr_uids=None):
"""Get known PilotManager(s).
Arguments:
- pmgr_uids (str | Iterable[str], optional): Unique identifier of the PilotManager(s) we want.
+ pmgr_uids (str | Iterable[str], optional): uids of the PilotManagers
+ we want.
Returns:
- radical.pilot.PilotManager | list[radical.pilot.PilotManager]: One or more `radical.pilot.PilotManager` objects.
+ radical.pilot.PilotManager | list[radical.pilot.PilotManager]: One
+ or more `radical.pilot.PilotManager` objects.
"""
@@ -564,20 +1143,19 @@ def get_pilot_managers(self, pmgr_uids=None):
#
def _register_tmgr(self, tmgr):
- self._dbs.insert_tmgr(tmgr.as_dict())
- self._tmgrs[tmgr.uid] = tmgr
-
-
- # --------------------------------------------------------------------------
- #
- def _reconnect_tmgr(self, tmgr):
-
- if not self._dbs.get_tmgrs(tmgr_ids=tmgr.uid):
- raise ValueError('could not reconnect to tmgr %s' % tmgr.uid)
-
self._tmgrs[tmgr.uid] = tmgr
+ # # --------------------------------------------------------------------------
+ # #
+ # def _reconnect_tmgr(self, tmgr):
+ #
+ # if not self._dbs.get_tmgrs(tmgr_ids=tmgr.uid):
+ # raise ValueError('could not reconnect to tmgr %s' % tmgr.uid)
+ #
+ # self._tmgrs[tmgr.uid] = tmgr
+ #
+ #
# --------------------------------------------------------------------------
#
def list_task_managers(self):
@@ -587,7 +1165,7 @@ def list_task_managers(self):
instances associated with this session.
Returns:
- list[str]: A list of :class:`radical.pilot.TaskManager` uids (`list` of `strings`).
+ list[str]: A list of :class:`radical.pilot.TaskManager` uids.
"""
@@ -600,7 +1178,7 @@ def get_task_managers(self, tmgr_uids=None):
"""Get known TaskManager(s).
Arguments:
- tmgr_uids (str | list[str]): Unique identifier of the TaskManager we want
+ tmgr_uids (str | list[str]): uids of the TaskManagers we want
Returns:
radical.pilot.TaskManager | list[radical.pilot.TaskManager]:
@@ -651,12 +1229,12 @@ def get_resource_config(self, resource, schema=None):
resource_cfg = copy.deepcopy(self._rcfgs[domain][host])
- if not schema:
- if 'schemas' in resource_cfg:
- schema = resource_cfg['schemas'][0]
+ if not schema:
+ schema = resource_cfg.get('default_schema')
+
+ if schema:
- if schema:
- if schema not in resource_cfg:
+ if schema not in resource_cfg['schemas']:
raise RuntimeError("schema %s unknown for resource %s"
% (schema, resource))
@@ -669,28 +1247,29 @@ def get_resource_config(self, resource, schema=None):
return resource_cfg
- # --------------------------------------------------------------------------
- #
- def fetch_json(self, tgt=None):
-
- return rpu.fetch_json(self._uid, dburl=self.dburl, tgt=tgt,
- session=self, skip_existing=True)
-
+ # # --------------------------------------------------------------------------
+ # #
+ # def fetch_json(self, tgt=None):
+ #
+ # return rpu.fetch_json(self._uid, tgt=tgt, session=self,
+ # skip_existing=True)
+ #
+ #
# --------------------------------------------------------------------------
#
def fetch_profiles(self, tgt=None):
- return rpu.fetch_profiles(self._uid, dburl=self.dburl, tgt=tgt,
- session=self, skip_existing=True)
+ return rpu.fetch_profiles(self._uid, tgt=tgt, session=self,
+ skip_existing=True)
# --------------------------------------------------------------------------
#
def fetch_logfiles(self, tgt=None):
- return rpu.fetch_logfiles(self._uid, dburl=self.dburl, tgt=tgt,
- session=self, skip_existing=True)
+ return rpu.fetch_logfiles(self._uid, tgt=tgt, session=self,
+ skip_existing=True)
# --------------------------------------------------------------------------
@@ -698,9 +1277,9 @@ def fetch_logfiles(self, tgt=None):
def _get_client_sandbox(self):
"""Client sandbox path.
- For the session in the client application, this is `os.getcwd()`. For the
- session in any other component, specifically in pilot components, the
- client sandbox needs to be read from the session config (or pilot
+ For the session in the client application, this is `os.getcwd()`. For
+ the session in any other component, specifically in pilot components,
+ the client sandbox needs to be read from the session config (or pilot
config). The latter is not yet implemented, so the pilot can not yet
interpret client sandboxes. Since pilot-side staging to and from the
client sandbox is not yet supported anyway, this seems acceptable
@@ -819,7 +1398,7 @@ def get_js_shell(self, resource, schema):
else: raise Exception("invalid schema: %s" % js_url.schema)
if js_url.schema == 'fork':
- js_url.hostname = 'localhost'
+ js_url.host = 'localhost'
self._log.debug("rsup.PTYShell('%s')", js_url)
shell = rsup.PTYShell(js_url, self)
@@ -898,7 +1477,7 @@ def _get_endpoint_fs(self, pilot):
resource = pilot['description'].get('resource')
if not resource:
- raise ValueError('Cannot get endpoint filesystem w/o resource target')
+ raise ValueError("Can't get fs-endpoint w/o resource target")
with self._cache_lock:
@@ -950,7 +1529,7 @@ def _get_task_sandbox(self, task, pilot):
# --------------------------------------------------------------------------
#
def _get_jsurl(self, pilot):
- """Get job service endpoint and hop URL for the pilot's target resource."""
+ """Get job service endpoint and hop URL for pilot's target resource."""
resrc = pilot['description']['resource']
schema = pilot['description']['access_schema']
@@ -971,101 +1550,5 @@ def _get_jsurl(self, pilot):
return js_url, js_hop
- # --------------------------------------------------------------------------
- #
- @staticmethod
- def autopilot(user, passwd):
-
- try:
- import github3
- except ImportError:
- print('ERROR: github3 library is not available')
- return
- import random
-
- labels = 'type:autopilot'
- titles = ['+++ Out of Cheese Error +++',
- '+++ Redo From Start! +++',
- '+++ Mr. Jelly! Mr. Jelly! +++',
- '+++ Melon melon melon',
- '+++ Wahhhhhhh! Mine! +++',
- '+++ Divide By Cucumber Error +++',
- '+++ Please Reinstall Universe And Reboot +++',
- '+++ Whoops! Here comes the cheese! +++',
- '+++ End of Cheese Error +++',
- '+++ Can Not Find Drive Z: +++',
- '+++ Unknown Application Error +++',
- '+++ Please Reboot Universe +++',
- '+++ Year Of The Sloth +++',
- '+++ error of type 5307 has occured +++',
- '+++ Eternal domain error +++',
- '+++ Error at Address Number 6, Treacle Mine Road +++']
-
- def excuse():
- cmd_fetch = "telnet bofh.jeffballard.us 666 2>&1 "
- cmd_filter = "grep 'Your excuse is:' | cut -f 2- -d :"
- out = ru.sh_callout("%s | %s" % (cmd_fetch, cmd_filter),
- shell=True)[0]
- return out.strip()
-
- github = github3.login(user, passwd)
- repo = github.repository("radical-cybertools", "radical.pilot")
-
- title = 'autopilot: %s' % titles[random.randint(0, len(titles) - 1)]
-
- print('----------------------------------------------------')
- print('autopilot')
-
- for issue in repo.issues(labels=labels, state='open'):
- if issue.title == title:
- reply = 'excuse: %s' % excuse()
- issue.create_comment(reply)
- print(' resolve: %s' % reply)
- return
-
- # issue not found - create
- body = 'problem: %s' % excuse()
- issue = repo.create_issue(title=title, body=body, labels=[labels],
- assignee=user)
- print(' issue : %s' % title)
- print(' problem: %s' % body)
- print('----------------------------------------------------')
-
-
# ------------------------------------------------------------------------------
-#
-class _CloseOptions(ru.TypedDict):
- """Options and validation for Session.close().
-
- Arguments:
- cleanup (bool, optional): Remove session from MongoDB.
- Implies *terminate=True*. (default False)
- download (bool, optional): Fetch pilot profiles and database entries.
- (Default False.)
- terminate (bool, optional): Shut down all pilots associated with the
- session. (Default True.)
-
- """
-
- _schema = {
- 'cleanup' : bool,
- 'download' : bool,
- 'terminate': bool
- }
-
- _defaults = {
- 'cleanup' : False,
- 'download' : False,
- 'terminate': True
- }
-
-
- # --------------------------------------------------------------------------
- #
- def _verify(self):
-
- if self.get('cleanup') and not self.get('terminate'):
- self.terminate = True
-
-# ------------------------------------------------------------------------------
diff --git a/src/radical/pilot/staging_directives.py b/src/radical/pilot/staging_directives.py
index 878d53b4b2..fecae1ae05 100644
--- a/src/radical/pilot/staging_directives.py
+++ b/src/radical/pilot/staging_directives.py
@@ -215,7 +215,11 @@ def complete_url(path : str,
# we expect hostname elements to be absent for schemas we expand
if purl.host:
- raise ValueError('URLs cannot specify `host` for expanded schemas')
+ try:
+ raise ValueError('URLs cannot specify `host` for expanded schemas')
+ except ValueError:
+ log.exception('purl host: %s' % str(purl))
+ raise
if purl.schema == 'file':
# we leave `file://` URLs unaltered
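The new block logs the offending URL before re-raising, so the failing `purl` ends up in the component log. A minimal sketch of that log-and-reraise pattern, assuming a standard logger:

    import logging

    log = logging.getLogger(__name__)

    def reject_host(purl):
        # log the exception with the offending URL attached, then re-raise
        try:
            raise ValueError('URLs cannot specify `host` for expanded schemas')
        except ValueError:
            log.exception('purl host: %s', purl)
            raise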
diff --git a/src/radical/pilot/task.py b/src/radical/pilot/task.py
index 706e3e07bc..19cf937450 100644
--- a/src/radical/pilot/task.py
+++ b/src/radical/pilot/task.py
@@ -505,7 +505,7 @@ def wait(self, state=None, timeout=None):
# we will never see another state progression. Raise an error
# (unless we waited for this)
if self.state in states:
- return
+ return self.state
# FIXME: do we want a raise here, really? This introduces a race,
# really, on application level
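Returning the state lets callers branch on what `wait()` actually unblocked on; a hedged usage sketch:

    # wait for a final state and inspect which one was reached
    state = task.wait(state=[rp.DONE, rp.FAILED], timeout=600)
    if state == rp.FAILED:
        print('task %s failed' % task.uid)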
diff --git a/src/radical/pilot/task_manager.py b/src/radical/pilot/task_manager.py
index b5724a62de..c7e65f0ed4 100644
--- a/src/radical/pilot/task_manager.py
+++ b/src/radical/pilot/task_manager.py
@@ -6,6 +6,7 @@
import os
import sys
import time
+import queue
import threading as mt
import radical.utils as ru
@@ -50,7 +51,7 @@ class TaskManager(rpu.Component):
Example::
- s = rp.Session(database_url=DBURL)
+ s = rp.Session()
pm = rp.PilotManager(session=s)
@@ -87,7 +88,7 @@ class TaskManager(rpu.Component):
# --------------------------------------------------------------------------
#
- def __init__(self, session, cfg='default', scheduler=None, uid=None):
+ def __init__(self, session, cfg='default', scheduler=None):
"""Create a new TaskManager and attaches it to the session.
Arguments:
@@ -101,14 +102,14 @@ def __init__(self, session, cfg='default', scheduler=None, uid=None):
"""
+ assert session._role == session._PRIMARY, 'tmgr needs primary session'
+
# initialize the base class (with no intent to fork)
- if uid:
- self._reconnect = True
- self._uid = uid
- else:
- self._reconnect = False
- self._uid = ru.generate_id('tmgr.%(item_counter)04d',
- ru.ID_CUSTOM, ns=session.uid)
+ self._uid = ru.generate_id('tmgr.%(item_counter)04d',
+ ru.ID_CUSTOM, ns=session.uid)
+
+ if not scheduler:
+ scheduler = rpc.SCHEDULER_ROUND_ROBIN
self._pilots = dict()
self._pilots_lock = mt.RLock()
@@ -118,6 +119,7 @@ def __init__(self, session, cfg='default', scheduler=None, uid=None):
self._tcb_lock = mt.RLock()
self._terminate = mt.Event()
self._closed = False
+ self._task_info = dict()
for m in rpc.TMGR_METRICS:
self._callbacks[m] = dict()
@@ -135,33 +137,30 @@ def __init__(self, session, cfg='default', scheduler=None, uid=None):
cfg.uid = self._uid
cfg.owner = self._uid
cfg.sid = session.uid
- cfg.base = session.base
cfg.path = session.path
- cfg.dburl = session.dburl
+ cfg.reg_addr = session.reg_addr
cfg.heartbeat = session.cfg.heartbeat
cfg.client_sandbox = session._get_client_sandbox()
- if scheduler:
- # overwrite the scheduler from the config file
- cfg.scheduler = scheduler
-
rpu.Component.__init__(self, cfg, session=session)
self.start()
self._log.info('started tmgr %s', self._uid)
+
+ self._rep = self._session._get_reporter(name=self._uid)
self._rep.info('<>ok\n')
@@ -248,6 +245,16 @@ def close(self):
self._closed = True
self._rep.ok('>>ok\n')
+ # dump json
+ json = self.as_dict()
+ # json['_id'] = self.uid
+ json['type'] = 'tmgr'
+ json['uid'] = self.uid
+ json['tasks'] = self._task_info
+
+ tgt = '%s/%s.json' % (self._session.path, self.uid)
+ ru.write_json(json, tgt)
+
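The dump can be read back with the matching radical.utils helper; a small sketch, assuming live `session` and `tmgr` handles:

    import radical.utils as ru

    dump = ru.read_json('%s/%s.json' % (session.path, tmgr.uid))
    print(dump['type'], dump['uid'], len(dump['tasks']))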
# --------------------------------------------------------------------------
#
@@ -311,185 +318,26 @@ def _pilot_state_cb(self, pilots, state=None):
if state in rps.FINAL:
- self._log.debug('pilot %s is final - pull tasks', pilot.uid)
+ self._log.debug('pilot %s is final', pilot.uid)
- task_cursor = self.session._dbs._c.find({
- 'type' : 'task',
- 'pilot' : pilot.uid,
- 'tmgr' : self.uid,
- 'control' : {'$in' : ['agent_pending', 'agent']}})
-
- if not task_cursor.count():
- tasks = list()
- else:
- tasks = list(task_cursor)
-
- self._log.debug("tasks pulled: %3d (pilot dead)", len(tasks))
-
- if not tasks:
- continue
-
- # update the tasks to avoid pulling them again next time.
- # NOTE: this needs not locking with the task pulling in the
- # _task_pull_cb, as that will only pull tmgr_pending
- # tasks.
- uids = [task['uid'] for task in tasks]
-
- self._session._dbs._c.update({'type' : 'task',
- 'uid' : {'$in' : uids}},
- {'$set' : {'control' : 'tmgr'}},
- multi=True)
- to_restart = list()
- for task in tasks:
-
- task['exception'] = 'RuntimeError("pilot died")'
- task['exception_detail'] = 'pilot %s is final' % pid
- task['state'] = rps.FAILED
-
- if not task['description'].get('restartable'):
- self._log.debug('task %s not restartable', task['uid'])
- continue
-
- self._log.debug('task %s is restartable', task['uid'])
- task['restarted'] = True
- td = TaskDescription(task['description'])
- to_restart.append(td)
- # FIXME: increment some restart counter in the description?
- # FIXME: reference the resulting new uid in the old task.
-
- if to_restart and not self._closed:
- self._log.debug('restart %s tasks', len(to_restart))
- restarted = self.submit_tasks(to_restart)
- for u in restarted:
- self._log.debug('restart task %s', u.uid)
-
- # final tasks are not pushed
- self.advance(tasks, publish=True, push=False)
+ # FIXME: MongoDB
+ # TODO: fail all non-final tasks which were assigned to that
+ # pilot
+ continue
+ ## for task in tasks:
+ ##
+ ## task['exception'] = 'RuntimeError("pilot died")'
+ ## task['exception_detail'] = 'pilot %s is final' % pid
+ ## task['state'] = rps.FAILED
+ ##
+ ## # final tasks are not pushed
+ ## self.advance(tasks, publish=True, push=False)
# keep cb registered
return True
- # --------------------------------------------------------------------------
- #
- def _state_pull_cb(self):
-
- if self._terminate.is_set():
- return False
-
- # pull all task states from the DB, and compare to the states we know
- # about. If any state changed, update the task instance and issue
- # notification callbacks as needed. Do not advance the state (again).
- # FIXME: we also pull for dead tasks. That is not efficient...
- # FIXME: this needs to be converted into a tailed cursor in the update
- # worker
- tasks = self._session._dbs.get_tasks(tmgr_uid=self.uid)
- self._update_tasks(tasks)
-
- return True
-
-
- # --------------------------------------------------------------------------
- #
- def _task_pull_cb(self):
-
- if self._terminate.is_set():
- return False
-
- # pull tasks from the agent which are about to get back
- # under tmgr control, and push them into the respective queues
- # FIXME: this should also be based on a tailed cursor
- # FIXME: Unfortunately, 'find_and_modify' is not bulkable, so we have
- # to use 'find'. To avoid finding the same tasks over and over
- # again, we update the 'control' field *before* running the next
- # find -- so we do it right here.
- task_cursor = self.session._dbs._c.find({'type' : 'task',
- 'tmgr' : self.uid,
- 'control' : 'tmgr_pending'})
-
- if not task_cursor.count():
- # no tasks whatsoever...
- # self._log.info("tasks pulled: 0")
- return True # this is not an error
-
- # update the tasks to avoid pulling them again next time.
- tasks = list(task_cursor)
- uids = [task['uid'] for task in tasks]
-
- self._log.info("tasks pulled: %d", len(uids))
-
- for task in tasks:
- task['control'] = 'tmgr'
-
- self._session._dbs._c.update({'type' : 'task',
- 'uid' : {'$in' : uids}},
- {'$set' : {'control' : 'tmgr'}},
- multi=True)
-
- self._log.info("tasks pulled: %4d", len(tasks))
- self._prof.prof('get', msg="bulk size: %d" % len(tasks), uid=self.uid)
- for task in tasks:
-
- # we need to make sure to have the correct state:
- uid = task['uid']
- self._prof.prof('get', uid=uid)
-
- old = task['state']
- new = rps._task_state_collapse(task['states'])
-
- if old != new:
- self._log.debug("task pulled %s: %s / %s", uid, old, new)
-
- task['state'] = new
-
- # now we really own the CUs, and can start working on them (ie. push
- # them into the pipeline).
-
- to_stage = list()
- to_finalize = list()
-
- for task in tasks:
- # only advance tasks to data stager if we need data staging
- # = otherwise finalize them right away
- if task['description'].get('output_staging'):
- if task['target_state'] != rps.DONE:
- if task['description']['stage_on_error']:
- to_stage.append(task)
- else:
- to_finalize.append(task)
- else:
- to_stage.append(task)
- else:
- to_finalize.append(task)
-
- # don't profile state transitions - those happened in the past
- if to_stage:
- if self._has_sout:
- # normal route: needs data stager
- self.advance(to_stage, publish=True, push=True, prof=False)
- else:
- self._log.error('output staging needed but not available!')
- for task in to_stage:
- task['target_state'] = rps.FAILED
- to_finalize.append(task)
-
- if to_finalize:
- # shortcut, skip the data stager, but fake state transition
- self.advance(to_finalize, state=rps.TMGR_STAGING_OUTPUT,
- publish=True, push=False)
-
- # move to final stata
- for task in to_finalize:
- target_state = task.get('target_state')
- if not target_state:
- target_state = rps.FAILED
- task['state'] = target_state
- self.advance(to_finalize, publish=True, push=False)
-
- return True
-
-
# --------------------------------------------------------------------------
#
def _state_sub_cb(self, topic, msg):
@@ -554,6 +402,8 @@ def _update_tasks(self, task_dicts):
self._tasks[uid]._update(task_dict)
to_notify.append([task, s])
+ self._task_info[uid] = task_dict
+
if to_notify:
if _USE_BULK_CB:
self._bulk_cbs(set([task for task,_ in to_notify]))
@@ -700,6 +550,7 @@ def add_pilots(self, pilots):
pilot.register_callback(self._pilot_state_cb)
pid = pilot_dict['uid']
+
if pid in self._pilots:
raise ValueError('pilot %s already added' % pid)
self._pilots[pid] = pilot_dict
@@ -782,6 +633,36 @@ def remove_pilots(self, pilot_ids, drain=False):
'tmgr' : self.uid}})
+ # --------------------------------------------------------------------------
+ #
+ # FIXME RPC
+ def control_cb(self, topic, msg):
+
+ cmd = msg['cmd']
+ arg = msg['arg']
+
+ if cmd == 'rpc_res':
+
+ self._log.debug('rpc res: %s', arg)
+ self._rpc_queue.put(arg)
+
+
+ # --------------------------------------------------------------------------
+ #
+ def pilot_rpc(self, pid, cmd, args):
+ '''Remote procedure call.
+
+ Send an RPC command and arguments to the pilot and wait for the
+ response. This is a synchronous operation at this point, and it is not
+ thread safe to have multiple concurrent RPC calls.
+ '''
+
+ if pid not in self._pilots:
+ raise ValueError('tmgr does not know pilot %s' % pid)
+
+ return self._pilots[pid].rpc(cmd=cmd, args=args)
+
+
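A hedged usage sketch of the new call; the command name and arguments are hypothetical and depend on what the pilot registers:

    # forward a hypothetical 'prepare_env' command to one pilot and block
    result = tmgr.pilot_rpc(pid=pilot.uid, cmd='prepare_env',
                            args={'env_name': 'test_env'})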
# --------------------------------------------------------------------------
#
def list_units(self):
@@ -954,14 +835,15 @@ def submit_tasks(self, descriptions):
if not descriptions:
return []
- tasks = list()
ret_list = True
if descriptions and not isinstance(descriptions, list):
ret_list = False
descriptions = [descriptions]
# we return a list of tasks
- self._rep.progress_tgt(len(descriptions) + len(tasks), label='submit')
+ tasks = list()
+ ret = list()
+ self._rep.progress_tgt(len(descriptions), label='submit')
for td in descriptions:
mode = td.mode
@@ -978,25 +860,30 @@ def submit_tasks(self, descriptions):
tasks.append(task)
self._rep.progress()
+ if len(tasks) >= 1024:
+ # submit this bulk
+ task_docs = [u.as_dict() for u in tasks]
+ self.advance(task_docs, rps.TMGR_SCHEDULING_PENDING,
+ publish=True, push=True)
+ ret += tasks
+ tasks = list()
+
+ # submit remaining bulk (if any)
+ if tasks:
+ task_docs = [t.as_dict() for t in tasks]
+ self.advance(task_docs, rps.TMGR_SCHEDULING_PENDING,
+ publish=True, push=True)
+ ret += tasks
# keep tasks around
with self._tasks_lock:
- for task in tasks:
+ for task in ret:
self._tasks[task.uid] = task
self._rep.progress_done()
- # insert tasks into the database, as a bulk.
- task_docs = [u.as_dict() for u in tasks]
- self._session._dbs.insert_tasks(task_docs)
-
- # Only after the insert can we hand the tasks over to the next
- # components (ie. advance state).
- self.advance(task_docs, rps.TMGR_SCHEDULING_PENDING,
- publish=True, push=True)
-
- if ret_list: return tasks
- else : return tasks[0]
+ if ret_list: return ret
+ else : return ret[0]
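Submission now advances tasks in bulks of at most 1024; the chunking pattern in isolation, as a generic sketch:

    def chunked(items, size=1024):
        # yield successive bulks of at most `size` elements
        for i in range(0, len(items), size):
            yield items[i:i + size]

    # each bulk becomes one state-advance message, bounding message size
    # for bulk in chunked(task_docs):
    #     self.advance(bulk, rps.TMGR_SCHEDULING_PENDING, publish=True, push=True)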
# --------------------------------------------------------------------------
@@ -1011,19 +898,21 @@ def _reconnect_tasks(self):
from .task import Task
from .task_description import TaskDescription
- task_docs = self._session._dbs.get_tasks(tmgr_uid=self.uid)
-
- with self._tasks_lock:
+ # FIXME MongoDB
- for doc in task_docs:
-
- td = TaskDescription(doc['description'])
- td.uid = doc['uid']
-
- task = Task(tmgr=self, descr=td, origin='client')
- task._update(doc, reconnect=True)
-
- self._tasks[task.uid] = task
+ # task_docs = self._session._dbs.get_tasks(tmgr_uid=self.uid)
+ #
+ # with self._tasks_lock:
+ #
+ # for doc in task_docs:
+ #
+ # td = TaskDescription(doc['description'])
+ # td.uid = doc['uid']
+ #
+ # task = Task(tmgr=self, descr=td, origin='client')
+ # task._update(doc, reconnect=True)
+ #
+ # self._tasks[task.uid] = task
# --------------------------------------------------------------------------
@@ -1283,7 +1172,8 @@ def cancel_tasks(self, uids=None):
'tmgr' : self.uid}})
# we also inform all pilots about the cancelation request
- self._session._dbs.pilot_command(cmd='cancel_tasks', arg={'uids':uids})
+ # FIXME: MongoDB
+ # self._session._dbs.pilot_command(cmd='cancel_tasks', arg={'uids':uids})
# In the default case of calling 'advance' above, we just set the state,
# so we *know* tasks are canceled.
diff --git a/src/radical/pilot/tmgr/scheduler/base.py b/src/radical/pilot/tmgr/scheduler/base.py
index 87cce2dd2b..42794711a4 100644
--- a/src/radical/pilot/tmgr/scheduler/base.py
+++ b/src/radical/pilot/tmgr/scheduler/base.py
@@ -70,10 +70,6 @@ def initialize(self):
# don't. Either way, we here subscribe to state updates.
self.register_subscriber(rpc.STATE_PUBSUB, self._base_state_cb)
- # Schedulers use that command channel to get information about
- # pilots being added or removed.
- self.register_subscriber(rpc.CONTROL_PUBSUB, self._base_control_cb)
-
# cache the local client sandbox to avoid repeated os calls
self._client_sandbox = os.getcwd()
@@ -199,7 +195,7 @@ def update_tasks(self, tasks):
# --------------------------------------------------------------------------
#
- def _base_control_cb(self, topic, msg):
+ def control_cb(self, topic, msg):
# we'll wait for commands from the tmgr, to learn about pilots we can
# use or we should stop using. We also track task cancelation, as all
@@ -214,8 +210,8 @@ def _base_control_cb(self, topic, msg):
if cmd not in ['add_pilots', 'remove_pilots', 'cancel_tasks']:
return True
- arg = msg['arg']
- tmgr = arg['tmgr']
+ arg = msg['arg']
+ tmgr = arg['tmgr']
self._log.info('scheduler command: %s: %s' % (cmd, arg))
@@ -306,18 +302,20 @@ def _base_control_cb(self, topic, msg):
to_cancel[pid] = list()
to_cancel[pid].append(uid)
- dbs = self._session._dbs
+ # FIXME: MongoDB
+ # dbs = self._session._dbs
+ dbs = None
if not dbs:
# too late, already closing down
return True
for pid in to_cancel:
- dbs.pilot_command(cmd='cancel_tasks',
- arg={'uids' : to_cancel[pid]},
- pids=pid)
-
- return True
+ # FIXME: MongoDB
+ pass
+ # self._session._dbs.pilot_command(cmd='cancel_tasks',
+ # arg={'uids' : to_cancel[pid]},
+ # pids=pid)
# --------------------------------------------------------------------------
diff --git a/src/radical/pilot/tmgr/staging_input/default.py b/src/radical/pilot/tmgr/staging_input/default.py
index 9fcaccfa41..e4abdce3dc 100644
--- a/src/radical/pilot/tmgr/staging_input/default.py
+++ b/src/radical/pilot/tmgr/staging_input/default.py
@@ -10,7 +10,7 @@
import radical.utils as ru
import radical.saga as rs
-rs.fs = rs.filesystem
+rsfs = rs.filesystem
from ... import states as rps
from ... import constants as rpc
@@ -34,13 +34,13 @@
# ------------------------------------------------------------------------------
#
class Default(TMGRStagingInputComponent):
- """
+ '''
This component performs all tmgr side input staging directives for compute
tasks. It gets tasks from the tmgr_staging_input_queue, in
TMGR_STAGING_INPUT_PENDING state, will advance them to TMGR_STAGING_INPUT
state while performing the staging, and then moves them to the
AGENT_SCHEDULING_PENDING state, passing control to the agent.
- """
+ '''
# --------------------------------------------------------------------------
#
@@ -54,20 +54,19 @@ def __init__(self, cfg, session):
def initialize(self):
# we keep a cache of SAGA dir handles
- self._fs_cache = dict()
- self._js_cache = dict()
- self._pilots = dict()
- self._pilots_lock = ru.RLock()
+ self._fs_cache = dict()
+ self._js_cache = dict()
+ self._pilots = dict()
+ self._pilots_lock = ru.RLock()
+ self._connected = list() # list of pilots connected via ZMQ
+ self._session_sbox = self._reg['cfg.session_sandbox']
self.register_input(rps.TMGR_STAGING_INPUT_PENDING,
rpc.TMGR_STAGING_INPUT_QUEUE, self.work)
- # FIXME: this queue is inaccessible, needs routing via mongodb
- self.register_output(rps.AGENT_STAGING_INPUT_PENDING, None)
-
- # we subscribe to the command channel to learn about pilots being added
- # to this task manager.
- self.register_subscriber(rpc.CONTROL_PUBSUB, self._base_control_cb)
+ # this queue is inaccessible to the agent - route it via the proxy queue
+ self.register_output(rps.AGENT_STAGING_INPUT_PENDING,
+ rpc.PROXY_TASK_QUEUE)
self._mkdir_threshold = self.cfg.get('task_bulk_mkdir_threshold',
TASK_BULK_MKDIR_THRESHOLD)
@@ -83,7 +82,7 @@ def finalize(self):
# --------------------------------------------------------------------------
#
- def _base_control_cb(self, topic, msg):
+ def control_cb(self, topic, msg):
# keep track of `add_pilots` commands and updates self._pilots
# accordingly.
@@ -91,30 +90,50 @@ def _base_control_cb(self, topic, msg):
cmd = msg.get('cmd')
arg = msg.get('arg')
- if cmd not in ['add_pilots']:
- self._log.debug('skip cmd %s', cmd)
+ if cmd == 'add_pilots':
+
+ pilots = arg['pilots']
+
+ with self._pilots_lock:
+
+ for pilot in pilots:
+ pid = pilot['uid']
+ self._log.debug('add pilot %s', pid)
+
+ if pid not in self._pilots:
+ self._pilots[pid] = pilot
- pilots = arg.get('pilots', [])
+ elif cmd == 'pilot_register':
- if not isinstance(pilots, list):
- pilots = [pilots]
+ pid = arg['pid']
+ self._log.debug('register pilot %s', pid)
- with self._pilots_lock:
- for pilot in pilots:
- pid = pilot['uid']
- self._log.debug('add pilot %s', pid)
- if pid not in self._pilots:
- self._pilots[pid] = pilot
+ if pid not in self._connected:
+ self._connected.append(pid)
- return True
+ # let pilot know that tasks will arrive via ZMQ
+ self.publish(rpc.CONTROL_PUBSUB, msg={'cmd': 'pilot_register_ok',
+ 'arg': {'pid': pid}})
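Distilled, the registration branch is a simple control-pubsub handshake; a self-contained sketch of the tmgr-side handler (the `publish` callable is injected here purely for illustration):

    def on_pilot_register(msg, connected, publish):
        # acknowledge a pilot that announced itself via ZMQ
        if msg['cmd'] == 'pilot_register':
            pid = msg['arg']['pid']
            if pid not in connected:
                connected.append(pid)
            publish({'cmd': 'pilot_register_ok', 'arg': {'pid': pid}})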
# --------------------------------------------------------------------------
#
- def work(self, tasks):
+ def _advance_tasks(self, tasks, pid=None, state=None, push=True):
+
+ if not state:
+ state = rps.AGENT_STAGING_INPUT_PENDING
+
+ # perform and publish state update
+ # push to the proxy queue
+ for task in tasks:
+ self._log.debug_8('push to proxy: %s', task['uid'])
+
+ self.advance(tasks, state, publish=True, push=push, qname=pid)
- if not isinstance(tasks, list):
- tasks = [tasks]
+
+ # --------------------------------------------------------------------------
+ #
+ def work(self, tasks):
self.advance(tasks, rps.TMGR_STAGING_INPUT, publish=True, push=False)
@@ -122,16 +141,15 @@ def work(self, tasks):
# advance them again as a bulk. We work over the others one by one, and
# advance them individually, to avoid stalling from slow staging ops.
- no_staging_tasks = list()
- staging_tasks = list()
+ session_sbox = self._session_sbox
+ staging_tasks = dict() # pid: [tasks]
+ no_staging_tasks = dict() # pid: [tasks]
for task in tasks:
- # no matter if we perform any staging or not, we will push the full
- # task info to the DB on the next advance, and will pass control to
- # the agent.
- task['$all'] = True
- task['control'] = 'agent_pending'
+ pid = task['pilot']
+ if pid not in staging_tasks : staging_tasks[pid] = list()
+ if pid not in no_staging_tasks: no_staging_tasks[pid] = list()
# check if we have any staging directives to be enacted in this
# component
@@ -141,9 +159,9 @@ def work(self, tasks):
actionables.append(sd)
if actionables:
- staging_tasks.append([task, actionables])
+ staging_tasks[pid].append([task, actionables])
else:
- no_staging_tasks.append(task)
+ no_staging_tasks[pid].append(task)
# Optimization: if we obtained a large bulk of tasks, we at this point
# attempt a bulk mkdir for the task sandboxes, to free the agent of
@@ -168,32 +186,32 @@ def work(self, tasks):
# to do about the pilot configuration (sandbox, access schema, etc), so
# we only attempt this optimization for tasks scheduled to pilots for
# which we learned those details.
- task_sboxes_by_pid = dict()
- for task in no_staging_tasks:
- sbox = task['task_sandbox']
- pid = task['pilot']
- if pid not in task_sboxes_by_pid:
- task_sboxes_by_pid[pid] = list()
- task_sboxes_by_pid[pid].append(sbox)
+ sboxes = dict() # pid: [sboxes]
+ for pid in no_staging_tasks:
+ for task in no_staging_tasks[pid]:
+ if pid not in sboxes:
+ sboxes[pid] = list()
+ sboxes[pid].append(task['task_sandbox'])
# now trigger the bulk mkdir for all filesystems which have more than
# a certain number of tasks to handle in this bulk:
- for pid in task_sboxes_by_pid:
+ for pid in sboxes:
with self._pilots_lock:
pilot = self._pilots.get(pid)
if not pilot:
# we don't feel inclined to optimize for unknown pilots
- self._log.debug('pid unknown - skip optimizion', pid)
+ self._log.debug('pid %s unknown - skip optimization', pid)
continue
- session_sbox = self._session._get_session_sandbox(pilot)
- task_sboxes = task_sboxes_by_pid[pid]
+ task_sboxes = sboxes[pid]
if len(task_sboxes) >= self._mkdir_threshold:
self._log.debug('tar %d sboxes', len(task_sboxes))
+ session_sbox = self._session._get_session_sandbox(pilot)
+
# no matter the bulk mechanism, we need a SAGA handle to the
# remote FS
sbox_fs = ru.Url(session_sbox) # deep copy
@@ -201,7 +219,7 @@ def work(self, tasks):
sbox_fs_str = str(sbox_fs)
if sbox_fs_str not in self._fs_cache:
self._fs_cache[sbox_fs_str] = \
- rs.fs.Directory(sbox_fs, session=self._session)
+ rsfs.Directory(sbox_fs, session=self._session)
saga_dir = self._fs_cache[sbox_fs_str]
# we have two options for a bulk mkdir:
@@ -243,7 +261,7 @@ def work(self, tasks):
type(session_sbox))
self._log.debug('copy: %s -> %s', tar_url, tar_rem_path)
saga_dir.copy(tar_url, tar_rem_path,
- flags=rs.fs.CREATE_PARENTS)
+ flags=rsfs.CREATE_PARENTS)
# get a job service handle to the target resource and run
# the untar command. Use the hop to skip the batch system
@@ -258,33 +276,34 @@ def work(self, tasks):
cmd = "tar xvf %s/%s -C %s" % (session_sbox.path, tar_name,
session_sbox.path)
- j = js_tmp.run_job(cmd)
+ j = js_tmp.run_job(cmd)
j.wait()
self._log.debug('untar : %s', cmd)
- self._log.debug('untar : %s\n---\n%s\n---\n%s',
- j.get_stdout_string(), j.get_stderr_string(),
- j.exit_code)
+ # self._log.debug('untar : %s\n---\n%s\n---\n%s',
+ # j.get_stdout_string(), j.get_stderr_string(),
+ # j.exit_code)
- if no_staging_tasks:
- # nothing to stage, push to the agent
- self.advance(no_staging_tasks, rps.AGENT_STAGING_INPUT_PENDING,
- publish=True, push=True)
+ for pid in no_staging_tasks:
+ if no_staging_tasks[pid]:
+ # nothing to stage, push to the agent
+ self._advance_tasks(no_staging_tasks[pid], pid)
to_fail = list()
- for task,actionables in staging_tasks:
- try:
- self._handle_task(task, actionables)
+ for pid in staging_tasks:
+ for task, actionables in staging_tasks[pid]:
+ try:
+ self._handle_task(task, actionables)
+ self._advance_tasks([task], pid)
- except Exception as e:
- # staging failed - do not pass task to agent
- task['control'] = 'tmgr'
- task['exception'] = repr(e)
- task['exception_detail'] = '\n'.join(ru.get_exception_trace())
- to_fail.append(task)
+ except Exception as e:
+ # staging failed - do not pass task to agent
+ task['control'] = 'tmgr'
+ task['exception'] = repr(e)
+ task['exception_detail'] = '\n'.join(ru.get_exception_trace())
+ to_fail.append(task)
- if to_fail:
- self.advance(to_fail, rps.FAILED, push=False, publish=True)
+ self._advance_tasks(to_fail, state=rps.FAILED, push=False)
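The per-pilot bucketing used above is a plain dict-of-lists pattern; a standalone sketch:

    from collections import defaultdict

    def bucket_by_pilot(tasks):
        # group task dicts by the pilot they are scheduled to
        buckets = defaultdict(list)
        for task in tasks:
            buckets[task['pilot']].append(task)
        return buckets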
# --------------------------------------------------------------------------
@@ -297,7 +316,7 @@ def _handle_task(self, task, actionables):
self._prof.prof("create_sandbox_start", uid=uid)
- src_context = {'pwd' : os.getcwd(), # !!!
+ src_context = {'pwd' : task['client_sandbox'], # !!!
'client' : task['client_sandbox'],
'task' : task['task_sandbox'],
'pilot' : task['pilot_sandbox'],
@@ -323,10 +342,10 @@ def _handle_task(self, task, actionables):
self._log.debug('key %s / %s', key, tmp)
if key not in self._fs_cache:
- self._fs_cache[key] = rs.fs.Directory(tmp, session=self._session)
+ self._fs_cache[key] = rsfs.Directory(tmp, session=self._session)
saga_dir = self._fs_cache[key]
- saga_dir.make_dir(sandbox, flags=rs.fs.CREATE_PARENTS)
+ saga_dir.make_dir(sandbox, flags=rsfs.CREATE_PARENTS)
self._prof.prof("create_sandbox_stop", uid=uid)
# Loop over all transfer directives and filter out tarball staging
@@ -336,6 +355,8 @@ def _handle_task(self, task, actionables):
# create a new actionable list during the filtering
new_actionables = list()
tar_file = None
+ tar_path = None
+ tar_sd = None
for sd in actionables:
@@ -402,13 +423,13 @@ def _handle_task(self, task, actionables):
# Check if the src is a folder, if true
# add recursive flag if not already specified
if os.path.isdir(src.path):
- flags |= rs.fs.RECURSIVE
+ flags |= rsfs.RECURSIVE
# Always set CREATE_PARENTS
- flags |= rs.fs.CREATE_PARENTS
+ flags |= rsfs.CREATE_PARENTS
- src = complete_url(src, src_context, self._log)
- tgt = complete_url(tgt, tgt_context, self._log)
+ src = complete_url(str(src), src_context, self._log)
+ tgt = complete_url(str(tgt), tgt_context, self._log)
self._prof.prof('staging_in_start', uid=uid, msg=did)
saga_dir.copy(src, tgt, flags=flags)
@@ -417,16 +438,15 @@ def _handle_task(self, task, actionables):
if tar_file:
+ assert tar_path
+ assert tar_sd
+
# some tarball staging was done. Add a staging directive for the
# agent to untar the tarball, and clean up.
tar_sd['action'] = rpc.TARBALL
task['description']['input_staging'].append(tar_sd)
os.remove(tar_path)
- # staging is done, we can advance the task at last
- self.advance(task, rps.AGENT_STAGING_INPUT_PENDING,
- publish=True, push=True)
-
# ------------------------------------------------------------------------------
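For the tarball path above, the component appends one more staging directive so the agent untars on arrival; a hedged illustration of such a directive (field names follow the surrounding code, values are hypothetical):

    tar_sd = {'action': 'Tarball',            # stands in for rpc.TARBALL
              'source': 'staging.tar',
              'target': 'task:///staging.tar'}

    task = {'description': {'input_staging': []}}
    task['description']['input_staging'].append(tar_sd)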
diff --git a/src/radical/pilot/tmgr/staging_output/default.py b/src/radical/pilot/tmgr/staging_output/default.py
index d201518cf5..f973f0f951 100644
--- a/src/radical/pilot/tmgr/staging_output/default.py
+++ b/src/radical/pilot/tmgr/staging_output/default.py
@@ -40,7 +40,9 @@ def initialize(self):
self._cache = dict()
self.register_input(rps.TMGR_STAGING_OUTPUT_PENDING,
- rpc.TMGR_STAGING_OUTPUT_QUEUE, self.work)
+ rpc.PROXY_TASK_QUEUE,
+ qname=self._session.uid,
+ cb=self.work)
# we don't need an output queue -- tasks will be final
@@ -115,7 +117,7 @@ def _handle_task(self, task, actionables):
'session' : task['session_sandbox'],
'resource' : task['resource_sandbox'],
'endpoint' : task['endpoint_fs']}
- tgt_context = {'pwd' : os.getcwd(), # !!!
+ tgt_context = {'pwd' : task['client_sandbox'], # !!!
'client' : task['client_sandbox'],
'task' : task['task_sandbox'],
'pilot' : task['pilot_sandbox'],
diff --git a/src/radical/pilot/utils/__init__.py b/src/radical/pilot/utils/__init__.py
index 4f6f2ab917..b838704efe 100644
--- a/src/radical/pilot/utils/__init__.py
+++ b/src/radical/pilot/utils/__init__.py
@@ -24,19 +24,23 @@
import resource
_limits = list(resource.getrlimit(resource.RLIMIT_NOFILE))
_limits[0] = 512
- resource.setrlimit(resource.RLIMIT_NOFILE, _limits)
+ resource.setrlimit(resource.RLIMIT_NOFILE, tuple(_limits))
+
except:
pass
# ------------------------------------------------------------------------------
#
-from .db_utils import *
-from .prof_utils import *
-from .misc import *
-from .session import *
-from .component import *
-from .serializer import *
+from .db_utils import *
+from .prof_utils import *
+from .misc import *
+from .session import *
+from .component import *
+from .component_manager import *
+from .serializer import *
+from .rpc_helper import *
+
# ------------------------------------------------------------------------------
diff --git a/src/radical/pilot/utils/component.py b/src/radical/pilot/utils/component.py
index a160428f34..23e593ae07 100644
--- a/src/radical/pilot/utils/component.py
+++ b/src/radical/pilot/utils/component.py
@@ -4,15 +4,19 @@
# pylint: disable=global-statement # W0603 global `_components`
+import io
import os
+import sys
import copy
import time
import threading as mt
import radical.utils as ru
-from .. import constants as rpc
-from .. import states as rps
+from .. import constants as rpc
+from .. import states as rps
+
+from ..messages import RPCRequestMessage, RPCResultMessage
# ------------------------------------------------------------------------------
@@ -34,250 +38,6 @@ def _atfork_child():
ru.atfork(ru.noop, ru.noop, _atfork_child)
-# ------------------------------------------------------------------------------
-#
-class ComponentManager(object):
- '''
- RP spans a hierarchy of component instances: the application has a pmgr and
- tmgr, and the tmgr has a staging component and a scheduling component, and
- the pmgr has a launching component, and components also can have bridges,
- etc. This ComponentManager centralises the code needed to spawn, manage and
- terminate such components. Any code which needs to create component should
- create a ComponentManager instance and pass the required component and
- bridge layout and configuration. Callng `stop()` on the cmgr will terminate
- the components and brisged.
- '''
-
- # --------------------------------------------------------------------------
- #
- def __init__(self, cfg):
-
- # register for at-fork hooks
- _components.append(self)
-
- self._cfg = ru.Config('radical.pilot.cmgr', cfg=cfg)
- self._sid = self._cfg.sid
-
- self._uid = ru.generate_id('cmgr.%(item_counter)04d',
- ru.ID_CUSTOM, ns=self._sid)
- self._uids = [self._uid] # uids to track hartbeats for (incl. own)
-
- self._prof = ru.Profiler(self._uid, ns='radical.pilot',
- path=self._cfg.path)
- self._log = ru.Logger(self._uid, ns='radical.pilot',
- path=self._cfg.path)
-
- self._prof.prof('init2', uid=self._uid, msg=self._cfg.path)
-
- # Every ComponentManager runs a HB pubsub bridge in a separate thread.
- # That HB channel should be used by all components and bridges created
- # under this CMGR.
- bcfg = ru.Config(cfg={'channel' : 'heartbeat',
- 'type' : 'pubsub',
- 'uid' : self._uid + '.hb',
- 'stall_hwm' : 1,
- 'bulk_size' : 0,
- 'path' : self._cfg.path})
- self._hb_bridge = ru.zmq.PubSub(bcfg)
- self._hb_bridge.start()
-
- self._cfg.heartbeat.addr_pub = str(self._hb_bridge.addr_pub)
- self._cfg.heartbeat.addr_sub = str(self._hb_bridge.addr_sub)
-
- # runs a HB monitor on that channel
- self._hb = ru.Heartbeat(uid=self.uid,
- timeout=self._cfg.heartbeat.timeout,
- interval=self._cfg.heartbeat.interval,
- beat_cb=self._hb_beat_cb, # on every heartbeat
- term_cb=self._hb_term_cb, # on termination
- log=self._log)
-
- self._hb_pub = ru.zmq.Publisher('heartbeat',
- self._cfg.heartbeat.addr_pub,
- log=self._log, prof=self._prof)
- self._hb_sub = ru.zmq.Subscriber('heartbeat',
- self._cfg.heartbeat.addr_sub,
- topic='heartbeat', cb=self._hb_sub_cb,
- log=self._log, prof=self._prof)
-
- # confirm the bridge being usable by listening to our own heartbeat
- self._hb.start()
- self._hb.wait_startup(self._uid, self._cfg.heartbeat.timeout)
- self._log.info('heartbeat system up')
-
-
- # --------------------------------------------------------------------------
- #
- def _hb_sub_cb(self, topic, msg):
- '''
- keep track of heartbeats for all bridges/components we know
- '''
-
- # self._log.debug('hb_sub %s: get %s check', self.uid, msg['uid'])
- if msg['uid'] in self._uids:
- # self._log.debug('hb_sub %s: get %s used', self.uid, msg['uid'])
- self._hb.beat(uid=msg['uid'])
-
-
- # --------------------------------------------------------------------------
- #
- def _hb_beat_cb(self):
- '''
- publish own heartbeat on the hb channel
- '''
-
- self._hb_pub.put('heartbeat', msg={'uid' : self.uid})
- # self._log.debug('hb_cb %s: put %s', self.uid, self.uid)
-
-
- # --------------------------------------------------------------------------
- #
- def _hb_term_cb(self, uid=None):
-
- self._log.debug('hb_term %s: %s died', self.uid, uid)
- self._prof.prof('term', uid=self._uid)
-
- # FIXME: restart goes here
-
- # NOTE: returning `False` indicates failure to recover. The HB will
- # terminate and suicidally kill the very process it is living in.
- # Make sure all required cleanup is done at this point!
-
- return None
-
-
- # --------------------------------------------------------------------------
- #
- @property
- def uid(self):
- return self._uid
-
-
- # --------------------------------------------------------------------------
- #
- @property
- def cfg(self):
- return self._cfg
-
-
- # --------------------------------------------------------------------------
- #
- def start_bridges(self, cfg=None):
- '''
- check if any bridges are defined under `cfg['bridges']` and start them
- '''
-
- self._prof.prof('start_bridges_start', uid=self._uid)
-
- timeout = self._cfg.heartbeat.timeout
-
- if cfg is None:
- cfg = self._cfg
-
- for bname, bcfg in cfg.get('bridges', {}).items():
-
- bcfg.uid = bname
- bcfg.channel = bname
- bcfg.cmgr = self.uid
- bcfg.sid = cfg.sid
- bcfg.path = cfg.path
- bcfg.heartbeat = cfg.heartbeat
-
- fname = '%s/%s.json' % (cfg.path, bcfg.uid)
- bcfg.write(fname)
-
- self._log.info('create bridge %s [%s]', bname, bcfg.uid)
-
- out, err, ret = ru.sh_callout('radical-pilot-bridge %s' % fname)
- self._log.debug('bridge startup out: %s', out)
- self._log.debug('bridge startup err: %s', err)
- if ret:
- raise RuntimeError('bridge startup failed')
-
- self._uids.append(bcfg.uid)
- self._log.info('created bridge %s [%s]', bname, bcfg.uid)
-
- # all bridges should start now, for their heartbeats
- # to appear.
- # self._log.debug('wait for %s', self._uids)
- failed = self._hb.wait_startup(self._uids, timeout=timeout)
- # self._log.debug('waited for %s: %s', self._uids, failed)
- if failed:
- raise RuntimeError('could not start all bridges %s' % failed)
-
- self._prof.prof('start_bridges_stop', uid=self._uid)
-
-
- # --------------------------------------------------------------------------
- #
- def start_components(self, cfg=None):
- '''
- check if any components are defined under `cfg['components']`
- and start them
- '''
-
- self._prof.prof('start_components_start', uid=self._uid)
-
- timeout = self._cfg.heartbeat.timeout
-
- if cfg is None:
- cfg = self._cfg
-
- # we pass a copy of the complete session config to all components, but
- # merge it into the component specific config settings (no overwrite),
- # and then remove the `bridges` and `components` sections
- #
- scfg = ru.Config(cfg=cfg)
- if 'bridges' in scfg: del scfg['bridges']
- if 'components' in scfg: del scfg['components']
- ru.expand_env(scfg)
-
- for cname, ccfg in cfg.get('components', {}).items():
-
- for _ in range(ccfg.get('count', 1)):
-
- ccfg.uid = ru.generate_id(cname + '.%(item_counter)04d',
- ru.ID_CUSTOM, ns=self._sid)
- ccfg.cmgr = self.uid
- ccfg.kind = cname
- ccfg.sid = cfg.sid
- ccfg.base = cfg.base
- ccfg.path = cfg.path
- ccfg.heartbeat = cfg.heartbeat
-
- ru.dict_merge(ccfg, scfg, policy=ru.PRESERVE, log=self._log)
-
- fname = '%s/%s.json' % (cfg.path, ccfg.uid)
- ccfg.write(fname)
-
- self._log.info('create component %s [%s]', cname, ccfg.uid)
-
- out, err, ret = ru.sh_callout('radical-pilot-component %s' % fname)
- self._log.debug('out: %s' , out)
- self._log.debug('err: %s' , err)
- if ret:
- raise RuntimeError('bridge startup failed')
-
- self._uids.append(ccfg.uid)
- self._log.info('created component %s [%s]', cname, ccfg.uid)
-
- # all components should start now, for their heartbeats
- # to appear.
- failed = self._hb.wait_startup(self._uids, timeout=timeout * 10)
- if failed:
- raise RuntimeError('could not start all components %s' % failed)
-
- self._prof.prof('start_components_stop', uid=self._uid)
-
-
- # --------------------------------------------------------------------------
- #
- def close(self):
-
- self._prof.prof('close', uid=self._uid)
-
- self._hb_bridge.stop()
- self._hb.stop()
# ------------------------------------------------------------------------------
@@ -343,18 +103,11 @@ def __init__(self, cfg, session):
to a file name to be opened as `ru.Config`, or as a pre-populated
`ru.Config` instance). That config MUST contain a session ID (`sid`) for
the session under which to run this component, and a uid for the component
- itself which MUST be unique within the scope of the given session. It MUST
- further contain information about the session's heartbeat ZMQ pubsub channel
- (`hb_pub`, `hb_sub`) on which heartbeats are sent and received for lifetime
- management. All components and the session will continuously sent heartbeat
- messages on that channel - missing heartbeats will by default lead to
- session termination.
-
- The config MAY contain `bridges` and `component` sections. If those exist,
- the component will start the communication bridges and the components
- specified therein, and is then considered an owner of those components and
- bridges. As such, it much watch the HB channel for heartbeats from those
- components, and must terminate itself if those go AWOL.
+ itself which MUST be unique within the scope of the given session.
+
+ All components and the component managers will continuously send heartbeat
+ messages on the control pubsub - missing heartbeats will by default lead to
+ component termination.
Further, the class must implement the registered work methods, with a
signature of::
@@ -407,6 +160,9 @@ def __init__(self, cfg, session):
constructor.
'''
+ # register for at-fork hooks
+ _components.append(self)
+
# NOTE: a fork will not duplicate any threads of the parent process --
# but it will duplicate any locks which are shared between the
# parent process and its threads -- and those locks might be in
@@ -415,42 +171,39 @@ def __init__(self, cfg, session):
# to create it's own set of locks in self.initialize.
self._cfg = cfg
- self._uid = cfg.uid
+ self._uid = self._cfg.uid
+ self._sid = self._cfg.sid
self._session = session
# we always need an UID
assert self._uid, 'Component needs a uid (%s)' % type(self)
# state we carry over the fork
- self._debug = cfg.get('debug')
- self._owner = cfg.get('owner', self.uid)
- self._ctype = "%s.%s" % (self.__class__.__module__,
- self.__class__.__name__)
- self._number = cfg.get('number', 0)
- self._name = cfg.get('name.%s' % self._number,
- '%s.%s' % (self._ctype, self._number))
-
- self._bridges = list() # communication bridges
- self._components = list() # sub-components
- self._inputs = dict() # queues to get things from
- self._outputs = dict() # queues to send things to
- self._workers = dict() # methods to work on things
- self._publishers = dict() # channels to send notifications to
- self._threads = dict() # subscriber and idler threads
- self._cb_lock = mt.RLock()
- # guard threaded callback invokations
- self._work_lock = mt.RLock()
- # guard threaded callback invokations
-
- self._subscribers = dict() # ZMQ Subscriber classes
+ self._owner = self._cfg.get('owner', self.uid)
+ self._ctype = "%s.%s" % (self.__class__.__module__,
+ self.__class__.__name__)
+
+ self._reg = self._session._reg
+
+ self._inputs = dict() # queues to get things from
+ self._outputs = dict() # queues to send things to
+ self._workers = dict() # methods to work on things
+ self._publishers = dict() # channels to send notifications to
+ self._threads = dict() # subscriber and idler threads
+ self._cb_lock = mt.RLock() # guard threaded callback invocations
+ self._rpc_lock = mt.RLock() # guard threaded rpc calls
+ self._rpc_reqs = dict() # currently active RPC requests
+ self._rpc_handlers = dict() # RPC handler methods
+ self._subscribers = dict() # ZMQ Subscriber classes
if self._owner == self.uid:
self._owner = 'root'
self._prof = self._session._get_profiler(name=self.uid)
- self._rep = self._session._get_reporter(name=self.uid)
self._log = self._session._get_logger (name=self.uid,
- level=self._debug)
+ level=self._cfg.get('log_lvl'),
+ debug=self._cfg.get('debug_lvl'))
+
self._q = None
self._in = None
self._out = None
@@ -465,8 +218,9 @@ def __init__(self, cfg, session):
#
def start(self):
+ # start worker thread
sync = mt.Event()
- self._thread = mt.Thread(target=self._worker_thread, args=[sync])
+ self._thread = mt.Thread(target=self._work_loop, args=[sync])
self._thread.daemon = True
self._thread.start()
@@ -475,21 +229,29 @@ def start(self):
if not self._thread.is_alive():
raise RuntimeError('worker thread died during initialization')
- time.sleep(0.1)
+ time.sleep(0.01)
assert self._thread.is_alive()
# --------------------------------------------------------------------------
#
- def _worker_thread(self, sync):
+ def wait(self):
+
+ while not self._term.is_set():
+ time.sleep(1)
+
+
+ # --------------------------------------------------------------------------
+ #
+ def _work_loop(self, sync):
try:
self._initialize()
except Exception:
self._log.exception('worker thread initialization failed')
- return
+ raise
sync.set()
@@ -520,12 +282,8 @@ def create(cfg, session):
from radical.pilot import pmgr as rppm
from radical.pilot import tmgr as rptm
from radical.pilot import agent as rpa
- from radical.pilot import raptor as rpt
- # from radical.pilot import constants as rpc
comp = {
- rpc.WORKER : rpt.Worker,
- rpc.UPDATE_WORKER : rpw.Update,
rpc.STAGER_WORKER : rpw.Stager,
rpc.PMGR_LAUNCHING_COMPONENT : rppm.Launching,
@@ -541,9 +299,9 @@ def create(cfg, session):
}
- assert cfg.kind in comp, '%s not in %s' % (cfg.kind, list(comp.keys()))
+ assert cfg.kind in comp, '%s not in %s (%s)' % (cfg.kind,
+ list(comp.keys()), cfg.uid)
- session._log.debug('create 1 %s: %s', cfg.kind, comp[cfg.kind])
return comp[cfg.kind].create(cfg, session)
@@ -555,10 +313,23 @@ def __str__(self):
# --------------------------------------------------------------------------
#
- def _cancel_monitor_cb(self, topic, msg):
+ def control_cb(self, topic, msg):
+ '''
+ This callback can be overloaded by the component to handle any control
+ message which is not already handled by the component base class.
+ '''
+ pass
+
+
+ # --------------------------------------------------------------------------
+ #
+ def _control_cb(self, topic, msg):
'''
We listen on the control channel for cancel requests, and append any
- found UIDs to our cancel list.
+ found UIDs to our cancel list. We also listen for RPC requests and
+ dispatch them to any registered RPC handler. All other control messages are
+ passed on to the `control_cb` handler which can be overloaded by
+ component implementations.
'''
# FIXME: We do not check for types of things to cancel - the UIDs are
@@ -566,10 +337,21 @@ def _cancel_monitor_cb(self, topic, msg):
# currently have no abstract 'cancel' command, but instead use
# 'cancel_tasks'.
- # self._log.debug('command incoming: %s', msg)
+ # try to handle message as RPC message
+ try:
+ self._handle_zmq_msg(msg)
+ # handled successfully
+ return
+
+ except:
+ # could not be handled - fall through to legacy handlers
+ pass
+
+ # handle any other message types
+ self._log.debug_5('command incoming: %s', msg)
cmd = msg['cmd']
- arg = msg['arg']
+ arg = msg.get('arg')
if cmd == 'cancel_tasks':
@@ -583,14 +365,145 @@ def _cancel_monitor_cb(self, topic, msg):
with self._cancel_lock:
self._cancel_list += uids
- if cmd == 'terminate':
+ # FIXME RPC: scheduler handles cancelation itself
+ if 'AgentSchedulingComponent' in repr(self):
+ self.control_cb(topic, msg)
+
+ elif cmd == 'terminate':
self._log.info('got termination command')
self.stop()
- # else:
- # self._log.debug('command ignored: %s', cmd)
+ else:
+ self._log.debug_1('command handled by implementation: %s', cmd)
+ self.control_cb(topic, msg)
+
+
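Component implementations can now hook any control message the base class does not consume by overloading `control_cb`; a minimal sketch, assuming `rpu` is `radical.pilot.utils`:

    class MyComponent(rpu.Component):

        def control_cb(self, topic, msg):
            # invoked for control messages not handled by the base class
            if msg['cmd'] == 'my_command':        # hypothetical command
                self._log.info('handle %s: %s', msg['cmd'], msg.get('arg'))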
+ # --------------------------------------------------------------------------
+ #
+ def _handle_zmq_msg(self, msg_data):
+
+ msg = ru.zmq.Message.deserialize(msg_data)
+
+ if isinstance(msg, RPCRequestMessage):
+ self._log.debug_4('handle rpc request %s', msg)
+ self._handle_rpc_msg(msg)
+
+ elif isinstance(msg, RPCResultMessage):
+
+ if msg.uid in self._rpc_reqs:
+ self._log.debug_4('handle rpc result %s', msg)
+ self._rpc_reqs[msg.uid]['res'] = msg
+ self._rpc_reqs[msg.uid]['evt'].set()
+
+ else:
+ raise ValueError('message type not handled')
- return True
+
+
+ # --------------------------------------------------------------------------
+ #
+ def _handle_rpc_msg(self, msg):
+
+ self._log.debug('handle rpc request: %s', msg)
+
+ bakout = sys.stdout
+ bakerr = sys.stderr
+
+ strout = None
+ strerr = None
+
+ val = None
+ out = None
+ err = None
+ exc = None
+
+ if msg.cmd not in self._rpc_handlers:
+ # this RPC message is *silently* ignored
+ self._log.debug('no rpc handler for [%s]', msg.cmd)
+ return
+
+ rpc_handler, addr = self._rpc_handlers[msg.cmd]
+
+ if msg.addr and msg.addr != addr:
+ self._log.debug('ignore rpc handler for [%s] [%s]', msg, addr)
+ return
+
+ try:
+ self._log.debug('rpc handler for %s: %s',
+ msg.cmd, self._rpc_handlers[msg.cmd])
+
+ sys.stdout = strout = io.StringIO()
+ sys.stderr = strerr = io.StringIO()
+
+ val = rpc_handler(*msg.args, **msg.kwargs)
+ out = strout.getvalue()
+ err = strerr.getvalue()
+
+ except Exception as e:
+ self._log.exception('rpc call failed: %s' % (msg))
+ val = None
+ out = strout.getvalue()
+ err = strerr.getvalue()
+ exc = (repr(e), '\n'.join(ru.get_exception_trace()))
+
+ finally:
+ # restore stdio
+ sys.stdout = bakout
+ sys.stderr = bakerr
+
+ rpc_res = RPCResultMessage(rpc_req=msg, val=val, out=out, err=err, exc=exc)
+ self._log.debug_3('rpc reply: %s', rpc_res)
+
+ self.publish(rpc.CONTROL_PUBSUB, rpc_res)
+
+
+ # --------------------------------------------------------------------------
+ #
+ def register_rpc_handler(self, cmd, handler, addr=None):
+
+ self._rpc_handlers[cmd] = [handler, addr]
+
+
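Together with `rpc()` below, this gives a synchronous request/response path over the control pubsub; a hedged usage sketch with hypothetical names:

    # provider side: expose a callable under a command name
    def get_load(factor=1):
        return factor * 42

    component.register_rpc_handler('get_load', get_load)

    # caller side: publishes an RPCRequestMessage and blocks for the result
    val = component.rpc('get_load', factor=2)      # -> 84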
+ # --------------------------------------------------------------------------
+ #
+ def rpc(self, cmd, addr=None, *args, **kwargs):
+ '''Remote procedure call.
+
+ Send an RPC command and arguments to the control pubsub and wait for the
+ response. This is a synchronous operation at this point, and it is not
+ thread safe to have multiple concurrent RPC calls.
+ '''
+
+ self._log.debug_5('rpc call %s(%s, %s)', cmd, args, kwargs)
+
+ rpc_id = ru.generate_id('%s.rpc' % self._uid)
+ rpc_req = RPCRequestMessage(uid=rpc_id, cmd=cmd,
+ args=args, kwargs=kwargs,
+ addr=addr)
+
+ self._rpc_reqs[rpc_id] = {
+ 'req': rpc_req,
+ 'res': None,
+ 'evt': mt.Event(),
+ 'time': time.time(),
+ }
+ self.publish(rpc.CONTROL_PUBSUB, rpc_req)
+
+ while True:
+
+ if not self._rpc_reqs[rpc_id]['evt'].wait(timeout=60):
+ self._log.debug_4('still waiting for rpc request %s', rpc_id)
+ continue
+
+ rpc_res = self._rpc_reqs[rpc_id]['res']
+ self._log.debug_4('rpc result: %s', rpc_res)
+
+ del self._rpc_reqs[rpc_id]
+
+ if rpc_res.exc:
+ raise RuntimeError('rpc failed: %s' % rpc_res.exc)
+
+ return rpc_res.val
# --------------------------------------------------------------------------
@@ -618,15 +531,16 @@ def _initialize(self):
'''
initialization of component base class goes here
'''
+
# components can always publish logs, state updates and control messages
# self.register_publisher(rpc.LOG_PUBSUB)
self.register_publisher(rpc.STATE_PUBSUB)
self.register_publisher(rpc.CONTROL_PUBSUB)
- # set controller callback to handle cancellation requests
+ # set controller callback to handle cancellation requests and RPCs
self._cancel_list = list()
self._cancel_lock = mt.RLock()
- self.register_subscriber(rpc.CONTROL_PUBSUB, self._cancel_monitor_cb)
+ self.register_subscriber(rpc.CONTROL_PUBSUB, self._control_cb)
# call component level initialize
self.initialize()
@@ -683,14 +597,15 @@ def stop(self):
# --------------------------------------------------------------------------
#
- def register_input(self, states, qname, worker, path=None):
+ def register_input(self, states, queue, cb=None, qname=None, path=None):
'''
Using this method, the component can be connected to a queue on which
things are received to be worked upon. The given set of states (which
can be a single state or a list of states) will trigger an assert check
upon thing arrival.
- This method will further associate a thing state with a specific worker.
+ This method will further associate a thing state with a specific worker
+ callback `cb`.
Upon thing arrival, the thing state will be used to lookup the
respective worker, and the thing will be handed over. Workers should
call self.advance(thing), in order to push the thing toward the next
@@ -706,16 +621,20 @@ def register_input(self, states, qname, worker, path=None):
if not states:
states = [None] # worker handles stateless entities
- name = '%s.%s.%s' % (self.uid, worker.__name__,
+ if cb: cbname = cb.__name__
+ else : cbname = 'none'
+
+ name = '%s.%s.%s' % (self.uid, cbname,
'_'.join([str(s) for s in states]))
if name in self._inputs:
raise ValueError('input %s already registered' % name)
- self._inputs[name] = {'queue' : self.get_input_ep(qname, path),
+ self._inputs[name] = {'queue' : self.get_input_ep(queue),
+ 'qname' : qname,
'states' : states}
- self._log.debug('registered input %s', name)
+ self._log.debug('registered input %s [%s] [%s]', name, queue, qname)
# we want exactly one worker associated with a state -- but a worker
# can be responsible for multiple states
@@ -727,9 +646,9 @@ def register_input(self, states, qname, worker, path=None):
if state in self._workers:
self._log.warn("%s replaces worker %s (%s)"
% (self.uid, self._workers[state], state))
- self._workers[state] = worker
+ self._workers[state] = cb
- self._log.debug('registered worker %s [%s]', worker.__name__, state)
+ self._log.debug('registered worker %s [%s]', cbname, state)
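The new keywords mirror how the staging components use this API; for example, the output stager above registers its input as (sketch, matching the staging_output diff):

    self.register_input(rps.TMGR_STAGING_OUTPUT_PENDING,
                        rpc.PROXY_TASK_QUEUE,
                        qname=self._session.uid,   # sub-queue to pull from
                        cb=self.work)              # worker for arriving things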
# --------------------------------------------------------------------------
@@ -768,7 +687,7 @@ def unregister_input(self, states, qname, worker):
# --------------------------------------------------------------------------
#
- def register_output(self, states, qname, path=None):
+ def register_output(self, states, qname):
'''
Using this method, the component can be connected to a queue to which
things are sent after being worked upon. The given set of states (which
@@ -803,41 +722,32 @@ def register_output(self, states, qname, path=None):
else:
# non-final state, ie. we want a queue to push to:
- self._outputs[state] = self.get_output_ep(qname, path)
+ self._outputs[state] = self.get_output_ep(qname)
# --------------------------------------------------------------------------
#
- def get_input_ep(self, qname, path=None):
+ def get_input_ep(self, qname):
'''
return an input endpoint
'''
- if not path:
- path = self._cfg.path
-
- # dig the addresses from the bridge's config file
- fname = '%s/%s.cfg' % (path, qname)
- cfg = ru.read_json(fname)
+ cfg = self._reg['bridges'][qname]
- return ru.zmq.Getter(qname, url=cfg['get'])
+ self._log.debug('get input ep: %s', qname)
+ return ru.zmq.Getter(qname, url=cfg['addr_get'])
# --------------------------------------------------------------------------
#
- def get_output_ep(self, qname, path=None):
+ def get_output_ep(self, qname):
'''
return an output endpoint
'''
- if not path:
- path = self._cfg.path
-
- # dig the addresses from the bridge's config file
- fname = '%s/%s.cfg' % (path, qname)
- cfg = ru.read_json(fname)
+ cfg = self._reg['bridges'][qname]
- return ru.zmq.Putter(qname, url=cfg['put'])
+ return ru.zmq.Putter(qname, url=cfg['addr_put'])
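Endpoint addresses are now looked up in the session registry instead of per-bridge config files; a sketch of the assumed registry layout (addresses are illustrative):

    # assumed registry content, one entry per bridge
    reg = {'bridges': {
        'proxy_task_queue': {'addr_put': 'tcp://127.0.0.1:10001',
                             'addr_get': 'tcp://127.0.0.1:10002'},
        'control_pubsub'  : {'addr_pub': 'tcp://127.0.0.1:10003',
                             'addr_sub': 'tcp://127.0.0.1:10004'},
    }}

    cfg = reg['bridges']['proxy_task_queue']   # -> addr_put / addr_get entries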
# --------------------------------------------------------------------------
@@ -944,7 +854,7 @@ def run(self):
if self._timeout and \
self._timeout > (time.time() - self._last):
# not yet
- time.sleep(0.1) # FIXME: make configurable
+ time.sleep(0.01) # FIXME: make configurable
continue
with self._cb_lock:
@@ -992,23 +902,17 @@ def unregister_timed_cb(self, cb):
# --------------------------------------------------------------------------
#
- def register_publisher(self, pubsub, path=None):
+ def register_publisher(self, pubsub):
'''
        Using this method, the component can register itself as a publisher
of notifications on the given pubsub channel.
'''
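+        #   e.g. (cf. `Stager.initialize` in this patch):
+        #       self.register_publisher(rpc.STAGER_RESPONSE_PUBSUB)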
- if not path:
- path = self._cfg.path
-
assert pubsub not in self._publishers
-
- # dig the addresses from the bridge's config file
- fname = '%s/%s.cfg' % (path, pubsub)
- cfg = ru.read_json(fname)
+ cfg = self._reg['bridges.%s' % pubsub]
self._publishers[pubsub] = ru.zmq.Publisher(channel=pubsub,
- url=cfg['pub'],
+ url=cfg['addr_pub'],
log=self._log,
prof=self._prof)
@@ -1017,7 +921,7 @@ def register_publisher(self, pubsub, path=None):
# --------------------------------------------------------------------------
#
- def register_subscriber(self, pubsub, cb, path=None):
+ def register_subscriber(self, pubsub, cb):
'''
This method is complementary to the register_publisher() above: it
registers a subscription to a pubsub channel. If a notification
@@ -1034,16 +938,11 @@ def register_subscriber(self, pubsub, cb, path=None):
invocation.
'''
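+        #   e.g. (the callback is invoked as `cb(topic, msg)`):
+        #
+        #       def state_cb(topic, msg):
+        #           ...
+        #       self.register_subscriber(rpc.STATE_PUBSUB, state_cb)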
- if not path:
- path = self._cfg.path
-
- # dig the addresses from the bridge's config file
- fname = '%s/%s.cfg' % (path, pubsub)
- cfg = ru.read_json(fname)
+ cfg = self._reg['bridges'][pubsub]
if pubsub not in self._subscribers:
self._subscribers[pubsub] = ru.zmq.Subscriber(channel=pubsub,
- url=cfg['sub'],
+ url=cfg['addr_sub'],
log=self._log,
prof=self._prof)
@@ -1068,20 +967,28 @@ def work_cb(self):
time.sleep(0.1)
return True
+        # TODO: should use a poller over all inputs, or better yet register
+        #       a callback
+
for name in self._inputs:
- qname = self._inputs[name]['queue']
+ qname = self._inputs[name]['qname']
+ queue = self._inputs[name]['queue']
states = self._inputs[name]['states']
# FIXME: a simple, 1-thing caching mechanism would likely
# remove the req/res overhead completely (for any
# non-trivial worker).
- things = qname.get_nowait(500) # in microseconds
+ things = queue.get_nowait(qname=qname, timeout=200) # microseconds
+ # self._log.debug('work_cb %s: %s %s %d', name, queue.channel,
+ # qname, len(things))
things = ru.as_list(things)
if not things:
- # return to have a chance to catch term signals
- return True
+ # next input
+ continue
+
+ # self._log.debug('work_cb: %d', len(things))
# the worker target depends on the state of things, so we
# need to sort the things into buckets by state before
@@ -1089,8 +996,6 @@ def work_cb(self):
buckets = dict()
for thing in things:
state = thing.get('state') # can be stateless
- uid = thing.get('uid') # and not have uids
- self._prof.prof('get', uid=uid, state=state)
if state not in buckets:
buckets[state] = list()
@@ -1100,32 +1005,35 @@ def work_cb(self):
for state,things in buckets.items():
- assert state in states, 'cannot handle state %s' % state
+ assert state in states, 'cannot handle state %s' % state
assert state in self._workers, 'no worker for state %s' % state
try:
- to_cancel = list()
-
- for thing in things:
-
- uid = thing.get('uid')
+ # filter out canceled things
+ if self._cancel_list:
# FIXME: this can become expensive over time
# if the cancel list is never cleaned
- if uid and uid in self._cancel_list:
- with self._cancel_lock:
- self._cancel_list.remove(uid)
- to_cancel.append(thing)
+                        to_cancel = list()
+                        with self._cancel_lock:
+                            for thing in things:
+                                if thing['uid'] in self._cancel_list:
+                                    to_cancel.append(thing)
+
+                            # the cancel list holds uids, `to_cancel` things
+                            cancel_uids = [t['uid'] for t in to_cancel]
+                            self._cancel_list = [x for x in self._cancel_list
+                                                 if x not in cancel_uids]
- self._log.debug('got %s (%s)', uid, state)
+ if to_cancel:
+ # only advance stateful entities, otherwise just drop
+ if state:
+ self.advance(to_cancel, rps.CANCELED,
+ publish=True, push=False)
- if to_cancel:
- # only advance stateful entities, otherwise just drop
- if state:
- self.advance(to_cancel, rps.CANCELED, publish=True,
- push=False)
- with self._work_lock:
- self._workers[state](things)
+
+ # self._log.debug('== got %d things (%s)', len(things), state)
+ # for thing in things:
+ # self._log.debug('got %s (%s)', thing['uid'], state)
+
+ self._workers[state](things)
except Exception as e:
@@ -1147,8 +1055,8 @@ def work_cb(self):
# --------------------------------------------------------------------------
#
- def advance(self, things, state=None, publish=True, push=False, ts=None,
- prof=True):
+ def advance(self, things, state=None, publish=True, push=False, qname=None,
+ ts=None, fwd=False, prof=True):
'''
Things which have been operated upon are pushed down into the queues
again, only to be picked up by the next component, according to their
@@ -1215,10 +1123,9 @@ def advance(self, things, state=None, publish=True, push=False, ts=None,
-            # If '$set' is set, we also publish all keys listed in there.
+            # If '$all' is set or the thing is in a final state, we publish
+            # the full thing.
            # In all other cases, we only send 'uid', 'type' and 'state'.
for thing in things:
+
if '$all' in thing:
del thing['$all']
- if '$set' in thing:
- del thing['$set']
to_publish.append(thing)
elif thing['state'] in rps.FINAL:
@@ -1228,13 +1135,11 @@ def advance(self, things, state=None, publish=True, push=False, ts=None,
tmp = {'uid' : thing['uid'],
'type' : thing['type'],
'state' : thing['state']}
- if '$set' in thing:
- for key in thing['$set']:
- tmp[key] = thing[key]
- del thing['$set']
to_publish.append(tmp)
- self.publish(rpc.STATE_PUBSUB, {'cmd': 'update', 'arg': to_publish})
+ self.publish(rpc.STATE_PUBSUB, {'cmd': 'update',
+ 'arg': to_publish,
+ 'fwd': fwd})
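+            # 'fwd' marks this update for forwarding to the other side of
+            # the client/agent boundary (an assumption based on its use here)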
# ts = time.time()
# for thing in things:
@@ -1258,7 +1163,7 @@ def advance(self, things, state=None, publish=True, push=False, ts=None,
if _state in rps.FINAL:
# things in final state are dropped
for thing in _things:
- self._log.debug('final %s [%s]', thing['uid'], _state)
+ # self._log.debug('final %s [%s]', thing['uid'], _state)
self._prof.prof('drop', uid=thing['uid'], state=_state,
ts=ts)
continue
@@ -1266,8 +1171,8 @@ def advance(self, things, state=None, publish=True, push=False, ts=None,
if _state not in self._outputs:
# unknown target state -- error
for thing in _things:
- self._log.debug("lost %s [%s] : %s", thing['uid'],
- _state, self._outputs)
+ # self._log.debug("lost %s [%s] : %s", thing['uid'],
+ # _state, self._outputs)
self._prof.prof('lost', uid=thing['uid'], state=_state,
ts=ts)
continue
@@ -1275,7 +1180,7 @@ def advance(self, things, state=None, publish=True, push=False, ts=None,
if not self._outputs[_state]:
# empty output -- drop thing
for thing in _things:
- self._log.debug('drop %s [%s]', thing['uid'], _state)
+ # self._log.debug('drop %s [%s]', thing['uid'], _state)
self._prof.prof('drop', uid=thing['uid'], state=_state,
ts=ts)
continue
@@ -1283,18 +1188,19 @@ def advance(self, things, state=None, publish=True, push=False, ts=None,
output = self._outputs[_state]
# push the thing down the drain
- self._log.debug('put bulk %s: %s', _state, len(_things))
- output.put(_things)
+ self._log.debug('put bulk %s: %s: %s', _state, len(_things),
+ output.channel)
+ output.put(_things, qname=qname)
- ts = time.time()
- for thing in _things:
- self._prof.prof('put', uid=thing['uid'], state=_state,
- msg=output.name, ts=ts)
+ # ts = time.time()
+ # for thing in _things:
+ # self._prof.prof('put', uid=thing['uid'], state=_state,
+ # msg=output.name, ts=ts)
# --------------------------------------------------------------------------
#
- def publish(self, pubsub, msg):
+ def publish(self, pubsub, msg, topic=None):
'''
push information into a publication channel
'''
@@ -1302,7 +1208,10 @@ def publish(self, pubsub, msg):
if not self._publishers.get(pubsub):
raise RuntimeError("no msg route for '%s': %s" % (pubsub, msg))
- self._publishers[pubsub].put(pubsub, msg)
+ if not topic:
+ topic = pubsub
+
+ self._publishers[pubsub].put(topic, msg)
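+        # e.g., as used in `advance()` above:
+        #     self.publish(rpc.STATE_PUBSUB, {'cmd': 'update',
+        #                                     'arg': to_publish})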
# ------------------------------------------------------------------------------
diff --git a/src/radical/pilot/utils/component_manager.py b/src/radical/pilot/utils/component_manager.py
new file mode 100644
index 0000000000..3319e03f4d
--- /dev/null
+++ b/src/radical/pilot/utils/component_manager.py
@@ -0,0 +1,223 @@
+
+__copyright__ = 'Copyright 2023, The RADICAL-Cybertools Team'
+__license__ = 'MIT'
+
+# pylint: disable=global-statement # W0603 global `_components`
+
+import os
+import copy
+import time
+
+import threading as mt
+import radical.utils as ru
+
+from .. import constants as rpc
+from .. import states as rps
+from ..messages import HeartbeatMessage
+
+
+# ------------------------------------------------------------------------------
+#
+class ComponentManager(object):
+ '''
+    RP spans a hierarchy of component instances: the application has a pmgr
+    and tmgr; the tmgr has a staging component and a scheduling component;
+    the pmgr has a launching component; and components also can have
+    bridges, etc.  This ComponentManager centralises the code needed to
+    spawn, manage and terminate such components.  Any code which needs to
+    create components should create a ComponentManager instance and pass
+    the required component and bridge layout and configuration.  Calling
+    `close()` on the cmgr will terminate the components and bridges.
+ '''
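+
+    #   A minimal usage sketch (hypothetical owner name; assumes a running
+    #   registry service which holds the session and heartbeat configs):
+    #
+    #       cmgr = ComponentManager(sid, reg_addr, owner='pmgr.0000')
+    #       cmgr.start_bridges(cfg.bridges)
+    #       cmgr.start_components(cfg.components)
+    #       ...
+    #       cmgr.close()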
+
+ # --------------------------------------------------------------------------
+ #
+ def __init__(self, sid, reg_addr, owner):
+
+ # create a registry client to obtain the session config and to store
+ # component and bridge configs
+
+ self._sid = sid
+ self._reg_addr = reg_addr
+ self._owner = owner
+
+ self._reg = ru.zmq.RegistryClient(url=self._reg_addr)
+ self._cfg = ru.Config(from_dict=self._reg['cfg'])
+ self._hb_cfg = ru.Config(from_dict=self._reg['heartbeat'])
+
+ self._uid = ru.generate_id('cmgr.%(item_counter)04d',
+ ru.ID_CUSTOM, ns=self._sid)
+
+ self._prof = ru.Profiler(self._uid, ns='radical.pilot',
+ path=self._cfg.path)
+ self._log = ru.Logger(self._uid, ns='radical.pilot',
+ path=self._cfg.path,
+ level=self._cfg.log_lvl,
+ debug=self._cfg.debug_lvl)
+
+ self._prof.prof('init2', uid=self._uid, msg=self._cfg.path)
+
+ self._log.debug('cmgr %s (%s)', self._uid, self._owner)
+
+ # component managers listen on the heartbeat pubsub to see if spawned
+ # components come alive
+ self._heartbeats = dict() # heartbeats we have seen
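+        # note: the subscriber is not stored -- it runs its own listener
+        # thread and invokes `self._hb_msg_cb` on incoming messages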
+ ru.zmq.Subscriber(channel='heartbeat_pubsub',
+ topic='heartbeat',
+ url=self._hb_cfg.addr_sub,
+ cb=self._hb_msg_cb,
+ log=self._log,
+ prof=self._prof)
+
+
+ # --------------------------------------------------------------------------
+ #
+ def _hb_msg_cb(self, topic, msg):
+
+ hb_msg = HeartbeatMessage(from_dict=msg)
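+        # record when this uid was last seen alive -- `_wait_startup` polls
+        # this map to detect component and bridge startup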
+ self._heartbeats[hb_msg.uid] = time.time()
+
+
+ # --------------------------------------------------------------------------
+ #
+ def _wait_startup(self, uids, timeout):
+ '''
+ Wait for the first heartbeat of the given component UIDs to appear. If
+ that does not happen before timeout, an exception is raised.
+ '''
+
+ start = time.time()
+ ok = list()
+ nok = uids
+ while True:
+
+ self._log.debug('wait for : %s', nok)
+
+ ok = [uid for uid in uids if uid in self._heartbeats]
+ nok = [uid for uid in uids if uid not in ok]
+
+ if len(ok) == len(uids):
+ break
+
+ if time.time() - start > timeout:
+ self._log.debug('wait failed: %s', nok)
+ raise RuntimeError('uids %s not found' % nok)
+
+ time.sleep(0.25)
+
+ self._log.debug('wait for done: %s', ok)
+
+
+ # --------------------------------------------------------------------------
+ #
+ @property
+ def uid(self):
+ return self._uid
+
+
+ # --------------------------------------------------------------------------
+ #
+ def start_bridges(self, bridges):
+
+ self._prof.prof('start_bridges_start', uid=self._uid)
+
+ buids = list()
+ for bname, bcfg in bridges.items():
+
+ uid = bname
+ buids.append(uid)
+
+ bcfg.uid = uid
+ bcfg.channel = bname
+ bcfg.cmgr = self.uid
+ bcfg.owner = self._owner
+ bcfg.sid = self._cfg.sid
+ bcfg.path = self._cfg.path
+ bcfg.reg_addr = self._cfg.reg_addr
+ bcfg.log_lvl = self._cfg.log_lvl
+ bcfg.debug_lvl = self._cfg.debug_lvl
+ bcfg.heartbeat = self._hb_cfg
+
+ self._reg['bridges.%s.cfg' % bname] = bcfg
+
+ # self._reg.put('bridge.%s' % bname, bcfg)
+ cmd = 'radical-pilot-bridge %s %s %s' \
+ % (self._sid, self._reg.url, bname)
+
+ out, err, ret = ru.sh_callout(cmd, cwd=self._cfg.path)
+
+ if ret:
+ raise RuntimeError('bridge startup failed')
+
+ self._heartbeats[bname] = None
+ self._log.info('created bridge %s [%s]', bname, bname)
+
+ # all bridges are started, wait for their heartbeats
+ self._log.debug('wait for %s', buids)
+ self._wait_startup(buids, timeout=self._hb_cfg.timeout)
+
+ self._prof.prof('start_bridges_stop', uid=self._uid)
+
+
+ # --------------------------------------------------------------------------
+ #
+    def start_components(self, components, cfg=None):
+
+        self._prof.prof('start_components_start', uid=self._uid)
+
+ cuids = list()
+ for cname, ccfg in components.items():
+
+ for _ in range(ccfg.get('count', 1)):
+
+ uid = ru.generate_id(cname + '.%(item_counter)04d',
+ ru.ID_CUSTOM, ns=self._sid)
+ cuids.append(uid)
+
+ ccfg.uid = uid
+ ccfg.kind = cname
+ ccfg.owner = self._owner
+ ccfg.sid = self._cfg.sid
+ ccfg.cmgr = self._cfg.uid
+ ccfg.base = self._cfg.base
+ ccfg.path = self._cfg.path
+ ccfg.reg_addr = self._cfg.reg_addr
+ ccfg.proxy_url = self._cfg.proxy_url
+ ccfg.log_lvl = self._cfg.log_lvl
+ ccfg.debug_lvl = self._cfg.debug_lvl
+ ccfg.heartbeat = self._hb_cfg
+
+ if cfg:
+ ru.dict_merge(ccfg, cfg, ru.OVERWRITE)
+
+ self._reg['components.%s.cfg' % uid] = ccfg
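+                # the spawned component process will pull this config from
+                # the registry, keyed by its uid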
+
+ self._log.info('create component %s [%s]', cname, uid)
+
+ cmd = 'radical-pilot-component %s %s %s' \
+ % (self._sid, self._reg.url, uid)
+ out, err, ret = ru.sh_callout(cmd, cwd=self._cfg.path)
+
+ self._log.debug('component startup out: %s' , out)
+ self._log.debug('component startup err: %s' , err)
+
+ if ret:
+ raise RuntimeError('component startup failed')
+
+ self._log.info('created component %s [%s]', cname, uid)
+
+ # all components should start now, wait for heartbeats to appear.
+ self._log.debug('wait for %s', cuids)
+ self._wait_startup(cuids, timeout=self._hb_cfg.timeout)
+
+ self._prof.prof('start_components_stop', uid=self._uid)
+
+
+ # --------------------------------------------------------------------------
+ #
+ def close(self):
+
+ self._prof.prof('close', uid=self._uid)
+
+
+# ------------------------------------------------------------------------------
diff --git a/src/radical/pilot/utils/db_utils.py b/src/radical/pilot/utils/db_utils.py
index 35b181afdb..e4a5730c54 100644
--- a/src/radical/pilot/utils/db_utils.py
+++ b/src/radical/pilot/utils/db_utils.py
@@ -1,9 +1,9 @@
-import datetime
-import json
import os
import sys
+import json
import time
+import datetime
import radical.utils as ru
@@ -18,7 +18,7 @@ def bson2json (bson_data):
# http://stackoverflow.com/questions/16586180/ \
# typeerror-objectid-is-not-json-serializable
- from bson.objectid import ObjectId
+ from bson.objectid import ObjectId
class MyJSONEncoder (json.JSONEncoder):
def default (self, o):
diff --git a/src/radical/pilot/utils/misc.py b/src/radical/pilot/utils/misc.py
index 872a91844d..9578928253 100644
--- a/src/radical/pilot/utils/misc.py
+++ b/src/radical/pilot/utils/misc.py
@@ -4,7 +4,7 @@
import os
import time
-from typing import Union
+from typing import List, Union
import radical.utils as ru
@@ -52,7 +52,7 @@ def get_rusage() -> str:
# ------------------------------------------------------------------------------
#
-def create_tar(tgt: str, dnames: str) -> None:
+def create_tar(tgt: str, dnames: List[str]) -> None:
'''
Create a tarball on the file system which contains all given directories
'''
@@ -180,10 +180,10 @@ def get_resource_fs_url(resource: str,
rcfg = get_resource_config(resource)
if not schema:
- schema = rcfg['schemas'][0]
+ schema = rcfg['default_schema']
# return a deep copy
- return ru.Url(rcfg[schema]['filesystem_endpoint'])
+ return ru.Url(rcfg['schemas'][schema]['filesystem_endpoint'])
# ------------------------------------------------------------------------------
@@ -213,10 +213,10 @@ def get_resource_job_url(resource: str,
rcfg = get_resource_config(resource)
if not schema:
- schema = rcfg['schemas'][0]
+ schema = rcfg['default_schema']
# return a deep copy
- return ru.Url(rcfg[schema]['job_manager_endpoint'])
+    return ru.Url(rcfg['schemas'][schema]['job_manager_endpoint'])
# ------------------------------------------------------------------------------
diff --git a/src/radical/pilot/utils/prof_utils.py b/src/radical/pilot/utils/prof_utils.py
index 78be272952..e9c4e41511 100644
--- a/src/radical/pilot/utils/prof_utils.py
+++ b/src/radical/pilot/utils/prof_utils.py
@@ -7,7 +7,6 @@
from .. import states as s
from ..task_description import RAPTOR_MASTER, RAPTOR_WORKER, TASK_EXECUTABLE
-from .session import fetch_json
_debug = os.environ.get('RP_PROF_DEBUG')
_node_index = dict()
@@ -435,7 +434,7 @@ def get_hostmap(profile):
'''
We abuse the profile combination to also derive a pilot-host map, which
will tell us on what exact host each pilot has been running. To do so, we
- check for the PMGR_ACTIVE advance event in agent.0.prof, and use the NTP
+ check for the PMGR_ACTIVE advance event in agent_0.prof, and use the NTP
sync info to associate a hostname.
'''
# FIXME: This should be replaced by proper hostname logging
@@ -471,7 +470,7 @@ def get_hostmap_deprecated(profiles):
for row in prof:
- if 'agent.0.prof' in pname and \
+ if 'agent_0.prof' in pname and \
row[ru.EVENT] == 'advance' and \
row[ru.STATE] == s.PMGR_ACTIVE:
hostmap[row[ru.UID]] = host_id
@@ -550,7 +549,7 @@ def get_session_profile(sid, src=None):
# ------------------------------------------------------------------------------
#
-def get_session_description(sid, src=None, dburl=None):
+def get_session_description(sid, src=None):
'''
This will return a description which is usable for radical.analytics
evaluation. It informs about:
@@ -562,43 +561,40 @@ def get_session_description(sid, src=None, dburl=None):
If `src` is given, it is interpreted as path to search for session
information (json dump). `src` defaults to `$PWD/$sid`.
-
- if `dburl` is given, its value is used to fetch session information from a
- database. The dburl value defaults to `RADICAL_PILOT_DBURL`.
-
'''
if not src:
src = '%s/%s' % (os.getcwd(), sid)
- if os.path.isfile('%s/%s.json' % (src, sid)):
- json = ru.read_json('%s/%s.json' % (src, sid))
- else:
- ftmp = fetch_json(sid=sid, dburl=dburl, tgt=src, skip_existing=True)
- json = ru.read_json(ftmp)
-
- # make sure we have uids
- # FIXME v0.47: deprecate
- def fix_json(json):
- def fix_uids(json):
- if isinstance(json, list):
- for elem in json:
- fix_uids(elem)
- elif isinstance(json, dict):
- if 'taskmanager' in json and 'tmgr' not in json:
- json['tmgr'] = json['taskmanager']
- if 'pilotmanager' in json and 'pmgr' not in json:
- json['pmgr'] = json['pilotmanager']
- if '_id' in json and 'uid' not in json:
- json['uid'] = json['_id']
- if 'cfg' not in json:
- json['cfg'] = dict()
- for v in json.values():
- fix_uids(v)
- fix_uids(json)
- fix_json(json)
-
- assert sid == json['session'][0]['uid'], 'sid inconsistent'
+ # construct session json from registry dump, tmgr and pmgr json files, and
+ # pilot and task json files
+
+ json = dict()
+
+ reg = ru.read_json('%s/%s.reg.json' % (src, sid))
+ del reg['rcfgs']
+
+ json['session'] = [ reg ]
+ json['tmgr'] = list()
+ json['pmgr'] = list()
+ json['pilot'] = list()
+ json['task'] = list()
+
+    for fname in glob.glob('%s/tmgr.*.json' % src):
+        json['tmgr'].append(ru.read_json(fname))
+
+    for fname in glob.glob('%s/pmgr.*.json' % src):
+        json['pmgr'].append(ru.read_json(fname))
+
+ for tmgr in json['tmgr']:
+ json['task'].extend(tmgr['tasks'].values())
+ del tmgr['tasks']
+
+ for pmgr in json['pmgr']:
+ json['pilot'].extend(pmgr['pilots'])
+ del pmgr['pilots']
+
+ json['session'][0]['uid'] = sid
ret = dict()
ret['entities'] = dict()
@@ -616,7 +612,7 @@ def fix_uids(json):
tree[sid]['children'].append(uid)
tree[uid] = {'uid' : uid,
'etype' : 'pmgr',
- 'cfg' : pmgr['cfg'],
+ 'cfg' : pmgr.get('cfg', {}),
'has' : ['pilot'],
'children' : list()
}
@@ -626,7 +622,7 @@ def fix_uids(json):
tree[sid]['children'].append(uid)
tree[uid] = {'uid' : uid,
'etype' : 'tmgr',
- 'cfg' : tmgr['cfg'],
+ 'cfg' : tmgr.get('cfg', {}),
'has' : ['task'],
'children' : list()
}
@@ -634,11 +630,19 @@ def fix_uids(json):
tree[uid]['description'] = dict()
for pilot in sorted(json['pilot'], key=lambda k: k['uid']):
- uid = pilot['uid']
- pmgr = pilot['pmgr']
- pilot['cfg']['resource_details'] = pilot['resource_details']
- tree[pmgr]['children'].append(uid)
- tree[uid] = {'uid' : uid,
+
+ pid = pilot['uid']
+ pmgr = pilot['pmgr']
+
+ details = pilot['description']
+ details = ru.dict_merge(details, pilot['resource_details'])
+
+ pilot['cfg'] = details
+ pilot['cfg']['resource_details'] = details
+ pilot['cfg']['resource_details']['rm_info'] = details
+
+ tree[pmgr]['children'].append(pid)
+ tree[pid] = {'uid' : pid,
'etype' : 'pilot',
'cfg' : pilot['cfg'],
'resources' : pilot['resources'],
diff --git a/src/radical/pilot/utils/rpc_helper.py b/src/radical/pilot/utils/rpc_helper.py
new file mode 100644
index 0000000000..f046e83653
--- /dev/null
+++ b/src/radical/pilot/utils/rpc_helper.py
@@ -0,0 +1,166 @@
+
+__copyright__ = 'Copyright 2023, The RADICAL-Cybertools Team'
+__license__ = 'MIT'
+
+import time
+import queue
+
+import threading as mt
+
+import radical.utils as ru
+
+from ..constants import CONTROL_PUBSUB
+from ..messages import RPCRequestMessage, RPCResultMessage
+
+
+# ------------------------------------------------------------------------------
+#
+class RPCHelper(object):
+ '''
+    This class implements a simple synchronous RPC mechanism.  It only
+    requires the pub/sub addresses of the control pubsub channel.
+ '''
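+
+    #   A minimal usage sketch (hypothetical command name and handler):
+    #
+    #       rpc = RPCHelper(owner, ctrl_addr_pub, ctrl_addr_sub, log, prof)
+    #       rpc.add_handler('echo', lambda *args, **kwargs: args)
+    #       res = rpc.request('echo', 'hello')  # blocks for the result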
+
+
+ # --------------------------------------------------------------------------
+ #
+ def __init__(self, owner, ctrl_addr_pub, ctrl_addr_sub, log, prof):
+
+ self._owner = owner # used for uid scope
+ self._addr_pub = ctrl_addr_pub
+ self._addr_sub = ctrl_addr_sub
+
+ self._log = log
+ self._prof = prof
+
+ self._active = None
+ self._queue = queue.Queue()
+ self._lock = mt.Lock()
+ self._handlers = dict()
+
+ self._pub = ru.zmq.Publisher(channel=CONTROL_PUBSUB,
+ url=self._addr_pub,
+ log=self._log,
+ prof=self._prof)
+
+ self._thread = mt.Thread(target=self._work)
+ self._thread.daemon = True
+ self._thread.start()
+
+
+ # --------------------------------------------------------------------------
+ #
+ def request(self, cmd, *args, **kwargs):
+
+ rid = ru.generate_id('%s.rpc' % self._owner)
+ req = RPCRequestMessage(uid=rid, cmd=cmd, args=args, kwargs=kwargs)
+
+ self._active = rid
+
+ self._pub.put(CONTROL_PUBSUB, req)
+ self._log.debug_3('sent rpc req %s', req)
+
+ res = self._queue.get()
+
+ assert res.uid == req.uid
+
+ if res.exc:
+ # FIXME: try to deserialize exception type
+ # this should work at least for standard exceptions
+ raise RuntimeError(str(res.exc))
+
+ return res
+
+
+ # --------------------------------------------------------------------------
+ #
+ def _work(self):
+
+ pub = ru.zmq.Publisher(channel=CONTROL_PUBSUB,
+ url=self._addr_pub,
+ log=self._log,
+ prof=self._prof)
+
+ sub = ru.zmq.Subscriber(channel=CONTROL_PUBSUB,
+ topic=CONTROL_PUBSUB,
+ url=self._addr_sub,
+ log=self._log,
+ prof=self._prof)
+ sub.subscribe(CONTROL_PUBSUB)
+
+        # give the subscriber a moment to establish its connections
+        time.sleep(1)
+
+ while True:
+
+ data = sub.get_nowait(100)
+ if not data or data == [None, None]:
+ continue
+
+ msg_topic = data[0]
+ msg_data = data[1]
+
+ if not isinstance(msg_data, dict):
+ continue
+
+ try:
+ msg = ru.zmq.Message.deserialize(msg_data)
+
+            except Exception:
+                # not a `ru.zmq.Message` type
+                continue
+
+ if isinstance(msg, RPCRequestMessage):
+
+ # handle any RPC requests for which a handler is registered
+ self._log.debug_2('got rpc req: %s', msg)
+
+ with self._lock:
+ if msg.cmd in self._handlers:
+ rep = self.handle_request(msg)
+ pub.put(CONTROL_PUBSUB, rep)
+ else:
+ self._log.debug_2('no rpc handler for %s', msg.cmd)
+
+ elif isinstance(msg, RPCResultMessage):
+
+ # collect any RPC response whose uid matches the one we wait for
+
+                self._log.debug_2('got rpc res: %s - %s',
+                                  self._active, msg.uid)
+ if self._active and msg.uid == self._active:
+ self._active = None
+ self._queue.put(msg)
+
+
+ # --------------------------------------------------------------------------
+ #
+ def add_handler(self, cmd, handler):
+ '''
+ register a handler for the specified rpc command type
+ '''
+
+ with self._lock:
+
+ if cmd in self._handlers:
+ raise ValueError('handler for rpc cmd %s already set' % cmd)
+
+ self._handlers[cmd] = handler
+
+
+ # --------------------------------------------------------------------------
+ #
+ def del_handler(self, cmd):
+ '''
+ unregister a handler for the specified rpc command type
+ '''
+
+ with self._lock:
+
+ if cmd not in self._handlers:
+ raise ValueError('handler for rpc cmd %s not set' % cmd)
+
+ del self._handlers[cmd]
+
+
+# ------------------------------------------------------------------------------
diff --git a/src/radical/pilot/utils/session.py b/src/radical/pilot/utils/session.py
index 650fa97766..e9dbbe2017 100644
--- a/src/radical/pilot/utils/session.py
+++ b/src/radical/pilot/utils/session.py
@@ -13,61 +13,7 @@
# ------------------------------------------------------------------------------
#
-def fetch_json(sid, dburl=None, tgt=None, skip_existing=False, session=None,
- log=None):
- '''
- Returns:
-
- file name.
-
- '''
-
- if not log and session:
- log = session._log
- elif not log:
- log = ru.Logger('radical.pilot.utils')
-
- if session:
- rep = session._rep
- else:
- rep = ru.Reporter('radical.pilot.utils')
-
- if not tgt:
- tgt = os.getcwd()
-
- if tgt.startswith('/'):
- dst = '%s/%s/%s.json' % (tgt, sid, sid)
- else:
- dst = '%s/%s/%s/%s.json' % (os.getcwd(), tgt, sid, sid)
-
- ru.rec_makedir(os.path.dirname(dst))
-
- if skip_existing and os.path.isfile(dst) and os.path.getsize(dst):
- log.info("session already in %s", dst)
- return dst
-
- # need to fetch from MongoDB
- if not dburl:
- dburl = os.environ.get('RADICAL_PILOT_DBURL')
-
- if not dburl:
- raise ValueError('need RADICAL_PILOT_DBURL to fetch session')
-
- mongo, db, _, _, _ = ru.mongodb_connect(dburl)
-
- json_docs = get_session_docs(sid, db)
- ru.write_json(json_docs, dst)
- mongo.close()
-
- log.info("session written to %s", dst)
- rep.ok("+ %s (json)\n" % sid)
-
- return dst
-
-
-# ------------------------------------------------------------------------------
-#
-def fetch_filetype(ext, name, sid, dburl=None, src=None, tgt=None, access=None,
+def fetch_filetype(ext, name, sid, src=None, tgt=None, access=None,
session=None, skip_existing=False, fetch_client=False, log=None):
'''
Args:
@@ -143,9 +89,11 @@ def fetch_filetype(ext, name, sid, dburl=None, src=None, tgt=None, access=None,
rs_file.close()
# we need the session json for pilot details
- json_name = fetch_json(sid, dburl, tgt, skip_existing, session, log)
- json_docs = ru.read_json(json_name)
- pilots = json_docs['pilot']
+ pilots = list()
+ for fname in glob.glob('%s/pmgr.*.json' % sid):
+ json_doc = ru.read_json(fname)
+ pilots.extend(json_doc['pilots'])
+
num_pilots = len(pilots)
log.debug("Session: %s", sid)
@@ -246,19 +194,19 @@ def fetch_filetype(ext, name, sid, dburl=None, src=None, tgt=None, access=None,
# ------------------------------------------------------------------------------
#
-def fetch_profiles (sid, dburl=None, src=None, tgt=None, access=None,
+def fetch_profiles (sid, src=None, tgt=None, access=None,
session=None, skip_existing=False, fetch_client=False, log=None):
- return fetch_filetype('prof', 'profiles', sid, dburl, src, tgt, access,
+ return fetch_filetype('prof', 'profiles', sid, src, tgt, access,
session, skip_existing, fetch_client, log)
# ------------------------------------------------------------------------------
#
-def fetch_logfiles (sid, dburl=None, src=None, tgt=None, access=None,
+def fetch_logfiles (sid, src=None, tgt=None, access=None,
session=None, skip_existing=False, fetch_client=False, log=None):
- return fetch_filetype('log', 'logfiles', sid, dburl, src, tgt, access,
+ return fetch_filetype('log', 'logfiles', sid, src, tgt, access,
session, skip_existing, fetch_client, log)
diff --git a/src/radical/pilot/worker/__init__.py b/src/radical/pilot/worker/__init__.py
index f71925bd9f..7ba678592d 100644
--- a/src/radical/pilot/worker/__init__.py
+++ b/src/radical/pilot/worker/__init__.py
@@ -2,7 +2,6 @@
__copyright__ = "Copyright 2016, http://radical.rutgers.edu"
__license__ = "MIT"
-from .update import Update
from .stager import Stager
diff --git a/src/radical/pilot/worker/stager.py b/src/radical/pilot/worker/stager.py
index f0768f3647..a35463fcf1 100644
--- a/src/radical/pilot/worker/stager.py
+++ b/src/radical/pilot/worker/stager.py
@@ -37,8 +37,7 @@ def __init__(self, cfg, session):
#
def initialize(self):
- self._sid = self._cfg['sid']
- self._dburl = self._cfg['dburl']
+ self._sid = self._cfg['sid']
self.register_input(rps.NEW, rpc.STAGER_REQUEST_QUEUE, self.work)
self.register_publisher(rpc.STAGER_RESPONSE_PUBSUB)
diff --git a/src/radical/pilot/worker/update.py b/src/radical/pilot/worker/update.py
deleted file mode 100644
index f58e0cb445..0000000000
--- a/src/radical/pilot/worker/update.py
+++ /dev/null
@@ -1,245 +0,0 @@
-
-__copyright__ = "Copyright 2016, http://radical.rutgers.edu"
-__license__ = "MIT"
-
-
-import time
-import pymongo
-
-import radical.utils as ru
-
-from .. import utils as rpu
-from .. import constants as rpc
-
-from ..db import DBSession
-
-
-# ------------------------------------------------------------------------------
-#
-DEFAULT_BULK_COLLECTION_TIME = 1.0 # seconds
-DEFAULT_BULK_COLLECTION_SIZE = 100 # seconds
-
-
-# ------------------------------------------------------------------------------
-#
-class Update(rpu.Worker):
- '''
- An UpdateWorker pushes Task and Pilot state updates to mongodb. Its instances
- compete for update requests on the update_queue. Those requests will be
- triplets of collection name, query dict, and update dict. Update requests
- will be collected into bulks over some time (BULK_COLLECTION_TIME) and
- number (BULK_COLLECTION_SIZE) to reduce number of roundtrips.
- '''
-
- # --------------------------------------------------------------------------
- #
- def __init__(self, cfg, session):
-
- rpu.Worker.__init__(self, cfg, session)
-
-
- # --------------------------------------------------------------------------
- #
- def initialize(self):
-
- self._sid = self._cfg['sid']
- self._dburl = self._cfg['dburl']
-
- # get db handle from a connected, non-primary session
- self._dbs = DBSession(self._sid, self._dburl, {}, self._log, connect=True)
- self._coll = self._dbs._c
- self._bulk = self._coll.initialize_ordered_bulk_op()
- self._last = time.time() # time of last bulk push
- self._uids = list() # list of collected uids
- self._lock = ru.Lock() # protect _bulk
-
- self._bulk_time = self._cfg.bulk_time
- self._bulk_size = self._cfg.bulk_size
-
- self.register_subscriber(rpc.STATE_PUBSUB, self._state_cb)
- self.register_timed_cb(self._idle_cb, timer=self._bulk_time)
-
-
- # --------------------------------------------------------------------------
- #
- @classmethod
- def create(cls, cfg, session):
-
- return cls(cfg, session)
-
-
- # --------------------------------------------------------------------------
- #
- def _timed_bulk_execute(self, flush=False):
-
- # is there anything to execute?
- if not self._uids:
- return True
-
- now = time.time()
- age = now - self._last
-
- # only push if flush is forced, or when collection time or size
- # have been exceeded
- if not flush \
- and age < self._bulk_time \
- and len(self._uids) < self._bulk_size:
- return False
-
- try:
- self._bulk.execute()
-
- except pymongo.errors.OperationFailure as e:
- self._log.exception('bulk exec error: %s' % e.details)
- raise
-
- except Exception as e:
- self._log.exception('mongodb error: %s', e)
- raise
-
- self._prof.prof('update_pushed', msg='bulk size: %d' % len(self._uids))
-
- # for entry in self._uids:
- #
- # uid = entry[0]
- # state = entry[2]
- #
- # if state:
- # self._prof.prof('update_pushed', uid=uid, msg=state)
- # else:
- # self._prof.prof('update_pushed', uid=uid)
-
- # empty bulk, refresh state
- self._last = now
- self._bulk = self._coll.initialize_ordered_bulk_op()
- self._uids = list()
-
- return True
-
-
- # --------------------------------------------------------------------------
- #
- def _idle_cb(self):
-
- with self._lock:
- self._timed_bulk_execute()
-
- return True
-
-
- # --------------------------------------------------------------------------
- #
- def _state_cb(self, topic, msg):
- '''
-
- # FIXME: this documentation is not final, nor does it reflect reality!
-
- 'msg' is expected to be of the form ['cmd', 'thing'], where 'thing' is
- an entity to update in the DB, and 'cmd' specifies the mode of update.
-
- 'things' are expected to be dicts with a 'type' and 'uid' field. If
- either one does not exist, an exception is raised.
-
- Supported types are:
-
- - task
- - pilot
-
- supported 'cmds':
-
- - delete : delete can be delayed until bulk is collected/flushed
- - update : update can be delayed until bulk is collected/flushed
- - state : update can be delayed until bulk is collected/flushed
- only state and state history are updated
- - delete_flush: delete is sent immediately (possibly in a bulk)
- - update_flush: update is sent immediately (possibly in a bulk)
- - state_flush : update is sent immediately (possibly in a bulk)
- only state and state history are updated
- - flush : flush pending bulk
-
- The 'thing' can contains '$set' and '$push' fields, which will then be
- used as given. For all other fields, we use the following convention:
-
- - scalar values: use '$set'
- - dict values: use '$set'
- - list values: use '$push'
-
- That implies that all potential 'list' types should be defined in the
- initial 'thing' insert as such, as (potentially empty) lists.
-
- For 'cmd' in ['state', 'state_flush'], only the 'uid' and 'state' fields
- of the given 'thing' are used, all other fields are ignored. If 'state'
- does not exist, an exception is raised.
- '''
-
- try:
- cmd = msg['cmd']
- things = msg['arg']
-
- # cmds = ['delete', 'update', 'state',
- # 'delete_flush', 'update_flush', 'state_flush', 'flush']
- if cmd not in ['update', 'insert']:
- return True
-
- if cmd == 'insert':
- self._dbs.insert_tasks(ru.as_list(things))
- return True
-
-
- # FIXME: we don't have any error recovery -- any failure to update
- # state in the DB will thus result in an exception here and tear
- # down the module.
- for thing in ru.as_list(things):
-
- # got a new request. Add to bulk (create as needed),
- # and push bulk if time is up.
- uid = thing['uid']
- ttype = thing['type']
- state = thing['state']
-
- if 'clone' in uid:
- # we don't push clone states to DB
- return True
-
- # self._prof.prof('update_request', msg=state, uid=uid)
-
- if not state:
- # nothing to push
- return True
-
- # create an update document
- update_dict = dict()
- update_dict['$set'] = dict()
- update_dict['$push'] = dict()
-
- for key,val in thing.items():
- # never set _id, states (to avoid index clash, doubled ops)
- if key not in ['_id', 'states', 'cmds']:
- update_dict['$set'][key] = val
-
- # we set state, put (more importantly) we push the state onto
- # the 'states' list, so that we can later get state progression
- # in sync with the state model, even if they have been pushed
- # here out-of-order
- update_dict['$push']['states'] = state
-
- with self._lock:
-
- # push the update request onto the bulk
- self._uids.append([uid, ttype, state])
- self._bulk.find ({'uid' : uid,
- 'type': ttype}) \
- .update(update_dict)
-
- with self._lock:
- # attempt a timed update
- self._timed_bulk_execute()
-
- return True
-
- except:
- return False
-
-
-# ------------------------------------------------------------------------------
-
diff --git a/tests/component_tests/task_overlay_worker/drive_worker.py b/tests/component_tests/task_overlay_worker/drive_worker.py
index 23ca9caf30..92efdf8bb3 100755
--- a/tests/component_tests/task_overlay_worker/drive_worker.py
+++ b/tests/component_tests/task_overlay_worker/drive_worker.py
@@ -29,9 +29,9 @@
} for i in range(n)]
])
- for i in range(n):
- for res in q_out.get():
- print('%s: %s' % (res['req'], res['out']))
+ # for i in range(n):
+ # for res in q_out.get():
+ # print('%s: %s' % (res['req'], res['out']))
# ------------------------------------------------------------------------------
diff --git a/tests/component_tests/test_cases/user_cfg.json b/tests/component_tests/test_cases/user_cfg.json
index b5d1c36cd9..f8a02817cd 100644
--- a/tests/component_tests/test_cases/user_cfg.json
+++ b/tests/component_tests/test_cases/user_cfg.json
@@ -1,10 +1,11 @@
{
"user_resource": {
- "schemas" : ["local"],
- "local" :
- {
- "job_manager_endpoint" : "fork://localhost/",
- "filesystem_endpoint" : "file://localhost/"
+ "default_schema" : "local",
+ "schemas" : {
+ "local" : {
+ "job_manager_endpoint": "fork://localhost",
+ "filesystem_endpoint" : "file://localhost"
+ }
},
"default_remote_workdir" : "$HOME",
"resource_manager" : "FORK",
diff --git a/tests/component_tests/test_component.py b/tests/component_tests/test_component.py
index ec0c57b22c..c88ebf76b3 100755
--- a/tests/component_tests/test_component.py
+++ b/tests/component_tests/test_component.py
@@ -10,7 +10,8 @@
import radical.utils as ru
-from radical.pilot.utils.component import Component, ComponentManager
+from radical.pilot.utils.component import Component
+from radical.pilot.utils.component_manager import ComponentManager
# ------------------------------------------------------------------------------
@@ -42,6 +43,9 @@ def test_output(self, mocked_init):
@mock.patch('radical.utils.sh_callout', return_value=('', '', 0))
def test_cm_start_components(self, mocked_sh_callout, mocked_init):
+        # FIXME: heartbeats use the session's HB channel which we don't have
+ return
+
cfg = {
'path' : '/tmp',
'heartbeat' : {'timeout': 10},
@@ -51,15 +55,21 @@ def test_cm_start_components(self, mocked_sh_callout, mocked_init):
}
}
- cm = ComponentManager(None)
- cm._uids = []
- cm._uid = 'cm.0000'
- cm._sid = 'session.0000'
- cm._cfg = ru.Config(cfg=cfg)
- cm._log = cm._prof = cm._hb = mock.Mock()
+ cm = ComponentManager('sid', 'reg_addr', 'owner')
+ cm._uids = []
+ cm._uid = 'cm.0000'
+ cm._sid = 'session.0000'
+ cm._owner = 'cm.0000'
+ cm._cfg = ru.Config(cfg=cfg)
+ cm._log = cm._prof = cm._hb = mock.Mock()
cm._hb.wait_startup = mock.Mock(return_value=0)
+ cm._heartbeats = dict()
+ cm._hb_cfg = ru.TypedDict({'timeout': 10})
+
+ cm._reg = ru.Config()
+ cm._reg_addr = None
- cm.start_components()
+ cm.start_components(ru.Config(cfg=cfg['components']))
for cname, ccfg in cfg['components'].items():
for fname in glob.glob('%s/%s*.json' % (cfg['path'], cname)):
diff --git a/tests/component_tests/test_session.py b/tests/component_tests/test_session.py
index 66430104e2..2249295aa8 100755
--- a/tests/component_tests/test_session.py
+++ b/tests/component_tests/test_session.py
@@ -1,6 +1,8 @@
+#!/usr/bin/env python3
+
# pylint: disable=protected-access, unused-argument, no-value-for-parameter
-__copyright__ = 'Copyright 2020-2022, The RADICAL-Cybertools Team'
+__copyright__ = 'Copyright 2020-2023, The RADICAL-Cybertools Team'
__license__ = 'MIT'
import glob
@@ -23,13 +25,17 @@ class TestSession(TestCase):
# --------------------------------------------------------------------------
#
@classmethod
- @mock.patch.object(Session, '_initialize_primary', return_value=None)
@mock.patch.object(Session, '_get_logger')
@mock.patch.object(Session, '_get_profiler')
@mock.patch.object(Session, '_get_reporter')
def setUpClass(cls, *args, **kwargs) -> None:
- cls._session = Session()
+ def init_primary(self):
+ self._reg = mock.Mock()
+ self._init_cfg_from_scratch()
+
+ with mock.patch.object(Session, '_init_primary', new=init_primary):
+ cls._session = Session()
cls._cleanup_files.append(cls._session.uid)
# --------------------------------------------------------------------------
@@ -38,6 +44,8 @@ def setUpClass(cls, *args, **kwargs) -> None:
def tearDownClass(cls) -> None:
for p in cls._cleanup_files:
+ if not p:
+ continue
for f in glob.glob(p):
if os.path.isdir(f):
try:
@@ -64,12 +72,14 @@ def test_get_resource_config(self):
# schemas are ["ssh", "gsissh"]
rcfg = self._session.get_resource_config(rcfg_label)
+
+ default_schema = rcfg.default_schema
self.assertEqual(rcfg.job_manager_endpoint,
- rcfg[rcfg.schemas[0]].job_manager_endpoint)
+ rcfg.schemas[default_schema].job_manager_endpoint)
new_schema = 'gsissh'
rcfg = self._session.get_resource_config(rcfg_label, schema=new_schema)
self.assertEqual(rcfg.job_manager_endpoint,
- rcfg[new_schema].job_manager_endpoint)
+ rcfg.schemas[new_schema].job_manager_endpoint)
# check exceptions
@@ -85,80 +95,82 @@ def test_get_resource_config(self):
# --------------------------------------------------------------------------
#
- @mock.patch.object(Session, '_initialize_primary', return_value=None)
@mock.patch.object(Session, '_get_logger')
@mock.patch.object(Session, '_get_profiler')
@mock.patch.object(Session, '_get_reporter')
- @mock.patch('radical.pilot.session.ru.Config')
- def test_resource_schema_alias(self, mocked_config, *args, **kwargs):
-
- mocked_config.return_value = ru.TypedDict({
- 'local': {
- 'test': {
- 'schemas' : ['schema_origin',
- 'schema_alias',
- 'schema_alias_alias'],
- 'schema_origin' : {'param_0': 'value_0'},
+ def test_resource_schema_alias(self, *args, **kwargs):
+
+ base_dir = os.path.join(os.path.expanduser('~'), '.radical')
+ self._cleanup_files.append(base_dir)
+
+ user_cfg_dir = os.path.join(base_dir, 'pilot', 'configs')
+ ru.rec_makedir(user_cfg_dir)
+
+ facility_cfg = {
+ 'test': {
+ 'default_schema' : 'schema_origin',
+ 'schemas' : {
+ 'schema_origin' : {'job_manager_hop': 'value_0'},
'schema_alias' : 'schema_origin',
'schema_alias_alias': 'schema_alias'
}
}
- })
+ }
+ ru.write_json(facility_cfg, '%s/resource_facility.json' % user_cfg_dir)
+
+ def init_primary(self):
+ self._reg = mock.Mock()
+ self._init_cfg_from_scratch()
- s_alias = Session()
+ with mock.patch.object(Session, '_init_primary', new=init_primary):
+ s_alias = Session()
+ self._cleanup_files.append(s_alias.uid)
self.assertEqual(
- s_alias._rcfgs.local.test.schema_origin,
- s_alias._rcfgs.local.test.schema_alias)
+ s_alias._rcfgs.facility.test.schema_origin,
+ s_alias._rcfgs.facility.test.schema_alias)
self.assertEqual(
- s_alias._rcfgs.local.test.schema_origin,
- s_alias._rcfgs.local.test.schema_alias_alias)
+ s_alias._rcfgs.facility.test.schema_origin,
+ s_alias._rcfgs.facility.test.schema_alias_alias)
self.assertEqual(
- s_alias.get_resource_config('local.test', 'schema_origin'),
- s_alias.get_resource_config('local.test', 'schema_alias_alias'))
-
- self._cleanup_files.append(s_alias.uid)
-
- with self.assertRaises(KeyError):
- # schema alias refers to unknown schema
- mocked_config.return_value = ru.TypedDict({
- 'local': {
- 'test': {
- 'schemas' : ['schema_alias_error'],
- 'schema_alias_error': 'unknown_schema'
- }
+ s_alias.get_resource_config('facility.test', 'schema_origin'),
+ s_alias.get_resource_config('facility.test', 'schema_alias_alias'))
+
+ # schema alias refers to unknown schema
+ facility_cfg = {
+ 'test': {
+ 'default_schema': 'schema_alias_error',
+ 'schemas': {
+ 'schemas': ['schema_alias_error'],
+ 'schema_alias_error': 'unknown_schema'
}
- })
- Session()
+ }
+ }
+ ru.write_json(facility_cfg, '%s/resource_facility.json' % user_cfg_dir)
+ with self.assertRaises(KeyError):
+ with mock.patch.object(Session, '_init_primary', new=init_primary):
+ Session()
# --------------------------------------------------------------------------
#
- @mock.patch.object(Session, 'created', return_value=0)
- @mock.patch.object(Session, 'closed', return_value=0)
- def test_close(self, mocked_closed, mocked_created):
+ def test_close(self):
+
+ class Dummy():
+ def put(*args, **kwargs):
+ pass
# check default values
- self.assertFalse(self._session._close_options.cleanup)
self.assertFalse(self._session._close_options.download)
self.assertTrue(self._session._close_options.terminate)
- # only `True` values are targeted
-
- self._session._closed = False
- self._session.close(cleanup=True)
- self.assertTrue(self._session._close_options.cleanup)
+ self._session._ctrl_pub = Dummy()
+ self._session._hb = mock.Mock()
+ self._session._hb_pubsub = mock.Mock()
+ self._session._reg_service = mock.Mock()
- self._session._closed = False
- self._session.fetch_json = mock.Mock()
- self._session.fetch_profiles = mock.Mock()
- self._session.fetch_logfiles = mock.Mock()
+ # only `True` values are targeted
self._session.close(download=True)
- self._session.fetch_json.assert_called()
- self._session.fetch_profiles.assert_called()
- self._session.fetch_logfiles.assert_called()
-
- self._session._closed = False
- self._session.close(cleanup=True, terminate=True)
+ self._session.close(terminate=True)
# --------------------------------------------------------------------------
#
@@ -207,6 +219,7 @@ def test_get_resource_sandbox(self):
if __name__ == '__main__':
tc = TestSession()
+ tc.setUpClass()
tc.test_list_resources()
tc.test_get_resource_config()
tc.test_resource_schema_alias()
diff --git a/tests/integration_tests/test_agent_bridge.py b/tests/integration_tests/test_agent_bridge.py
new file mode 100755
index 0000000000..2c88bb9e7e
--- /dev/null
+++ b/tests/integration_tests/test_agent_bridge.py
@@ -0,0 +1,53 @@
+#!/usr/bin/env python3
+
+import sys
+import time
+
+import radical.utils as ru
+
+
+# ------------------------------------------------------------------------------
+#
+def test_agent_bridge(url=None):
+
+ if not url:
+ return
+
+ bridge = ru.zmq.Client(url=url)
+
+ sid = 'foo'
+
+    try:
+        print(bridge.request('client_register', {'sid': sid}))
+        print(bridge.request('client_lookup', {'sid': sid}))
+
+        # keep the registration alive with a heartbeat every 3 seconds
+        for _ in range(5):
+            time.sleep(3)
+            print(bridge.request('client_heartbeat', {'sid': sid}))
+
+        print(bridge.request('client_lookup', {'sid': sid}))
+        print(bridge.request('client_unregister', {'sid': sid}))
+        print(bridge.request('client_lookup', {'sid': sid}))
+
+ finally:
+ bridge.close()
+
+
+# ------------------------------------------------------------------------------
+#
+if __name__ == '__main__':
+
+ url = None
+ if len(sys.argv) > 1:
+ url = sys.argv[1]
+
+ test_agent_bridge(url)
+
+
+# ------------------------------------------------------------------------------
+
diff --git a/tests/integration_tests/test_lm/test_jsrun.py b/tests/integration_tests/test_lm/test_jsrun.py
index 88a1204e1d..ecaceeaf63 100755
--- a/tests/integration_tests/test_lm/test_jsrun.py
+++ b/tests/integration_tests/test_lm/test_jsrun.py
@@ -72,7 +72,6 @@ def test_command(self, mocked_init):
result = test_case['result']
for i in range(len(result)):
if '{node}' in result[i]:
- print(result[i])
result[i] = result[i].format(node=self.node_name)
log = mock.Mock()
@@ -83,7 +82,7 @@ def test_command(self, mocked_init):
component._init_from_scratch(None, None)
# FIXME
- command, _ = component.construct_command(task, None)
+ command, _ = component.get_launch_cmds(task, '.')
p = sp.Popen(command, stdout=sp.PIPE,
stderr=sp.PIPE, shell=True)
diff --git a/tests/test_raptor/test_raptor.py b/tests/test_raptor/test_raptor.py
index 104b02a6d5..4fdc5a74e8 100755
--- a/tests/test_raptor/test_raptor.py
+++ b/tests/test_raptor/test_raptor.py
@@ -17,22 +17,56 @@
class TestWorker(TestCase):
def read_json_side_effect(self, fname=None):
- return {'sub': '', 'pub': '', 'cores_per_rank': 8, 'gpus_per_rank': 2}
+        return {'addr_sub': '', 'addr_pub': '',
+                'cores_per_rank': 8, 'gpus_per_rank': 2}
+
+    def dict_merge_side_effect(self, fname=None):
+        return {'addr_sub': '', 'addr_pub': '',
+                'cores_per_rank': 8, 'gpus_per_rank': 2}
+ class MyConfig(ru.TypedDict):
+ def __init__(self, cfg=None, from_dict=None):
+ if cfg: super().__init__(from_dict=cfg)
+ else : super().__init__(from_dict=from_dict)
+
+ class MyRegistry(ru.TypedDict):
+
+ def __init__(self, url):
+
+ data = {
+ 'cfg': {},
+ 'bridges.state_pubsub': {
+ 'addr_sub': 'tcp://localhost:10000',
+ 'addr_pub': 'tcp://localhost:10001'
+ },
+ 'bridges.control_pubsub': {
+ 'addr_sub': 'tcp://localhost:10000',
+ 'addr_pub': 'tcp://localhost:10001'
+ },
+ 'raptor.task.000000.cfg': {
+ 'cores_per_rank': 8,
+ 'gpus_per_rank' : 2
+ }
+ }
+
+ super().__init__(from_dict=data)
+
+ def dump(self, *args, **kwargs): pass
+
+
+ @mock.patch('radical.utils.zmq.RegistryClient', MyRegistry)
@mock.patch('radical.utils.zmq.Subscriber')
@mock.patch('radical.utils.zmq.Publisher')
@mock.patch('radical.utils.zmq.Putter')
@mock.patch('radical.utils.read_json', side_effect=read_json_side_effect)
+ @mock.patch('radical.utils.Config', MyConfig)
@mock.patch('threading.Event')
@mock.patch('threading.Thread')
- def test_alloc(self, mock_1, mock_2, mock_3, mock_4, mock_5, mock_6):
+ def test_alloc(self, *args):
- cfg = ru.Config(cfg={'uid' : 'worker.0000',
- 'sid' : str(time.time()),
- 'info' : {},
- 'cores_per_rank': 8,
- 'gpus_per_rank' : 2})
+ cfg = ru.Config(from_dict={'uid' : 'worker.0000',
+ 'sid' : str(time.time()),
+ 'info' : {},
+ 'cores_per_rank': 8,
+ 'gpus_per_rank' : 2})
ru.zmq.Subscriber = mock.Mock()
ru.zmq.Publisher = mock.Mock()
@@ -44,21 +78,21 @@ def test_alloc(self, mock_1, mock_2, mock_3, mock_4, mock_5, mock_6):
ru.zmq.Putter = mock.Mock()
ru.zmq.Getter = mock.Mock()
- rp.raptor.Worker.publish = mock.Mock()
- rp.raptor.Worker._ts_addr = 'tcp://localhost:1'
- rp.raptor.Worker._res_addr_put = 'tcp://localhost:2'
- rp.raptor.Worker._req_addr_get = 'tcp://localhost:3'
+ rp.raptor.Worker.publish = mock.Mock()
+ rp.raptor.Worker._ts_addr = 'tcp://localhost:1'
+ rp.raptor.Worker._res_addr_put = 'tcp://localhost:2'
+ rp.raptor.Worker._req_addr_get = 'tcp://localhost:3'
- os.environ['RP_TASK_ID'] = 'task.000000'
- os.environ['RP_TASK_SANDBOX'] = '/tmp'
- os.environ['RP_PILOT_SANDBOX'] = '/tmp'
- os.environ['RP_RANKS'] = str(8)
+ os.environ['cores_per_rank'] = '8'
+ os.environ['gpus_per_rank'] = '2'
+ os.environ['RP_TASK_ID'] = 'task.000000'
+ os.environ['RP_TASK_SANDBOX'] = '/tmp'
+ os.environ['RP_PILOT_SANDBOX'] = '/tmp'
+ os.environ['RP_RANKS'] = str(8)
+ os.environ['RP_SESSION_ID'] = 'foo'
+ os.environ['RP_REGISTRY_ADDRESS'] = 'tcp://localhost:10001'
- with ru.ru_open('/tmp/control_pubsub.cfg', 'w') as fout:
- fout.write('{"sub": "tcp://localhost:10000", '
- ' "pub": "tcp://localhost:10001"}\n')
-
- worker = rp.raptor.DefaultWorker(cfg)
+ worker = rp.raptor.DefaultWorker('master.0000')
task_1 = {'uid': 'task.0000', 'cores': 1, 'gpus' : 1}
task_2 = {'uid': 'task.0001', 'cores': 2, 'gpus' : 1}
@@ -99,8 +133,6 @@ def test_alloc(self, mock_1, mock_2, mock_3, mock_4, mock_5, mock_6):
self.assertEqual(worker._resources['cores'], [0, 0, 0, 0, 0, 0, 0, 0])
self.assertEqual(worker._resources['gpus' ], [0, 0])
- os.unlink('/tmp/control_pubsub.cfg')
-
# ------------------------------------------------------------------------------
#
diff --git a/tests/unit_tests/test_agent_0/test_agent_0.py b/tests/unit_tests/test_agent_0/test_agent_0.py
index 1b348b0bad..8e35b1a7a3 100755
--- a/tests/unit_tests/test_agent_0/test_agent_0.py
+++ b/tests/unit_tests/test_agent_0/test_agent_0.py
@@ -9,6 +9,8 @@
from unittest import mock, TestCase
+from radical.pilot.messages import RPCRequestMessage, RPCResultMessage
+
import radical.utils as ru
import radical.pilot as rp
@@ -22,13 +24,20 @@ class TestComponent(TestCase):
_cleanup_files = []
+ def _init_primary_side_effect(self):
+
+ self._log = mock.MagicMock()
+ self._prof = mock.MagicMock()
+ self._rep = mock.MagicMock()
+ self._reg = mock.MagicMock()
+
+
# --------------------------------------------------------------------------
#
@classmethod
- @mock.patch.object(rp.Session, '_initialize_primary', return_value=None)
- @mock.patch.object(rp.Session, '_get_logger')
- @mock.patch.object(rp.Session, '_get_profiler')
- @mock.patch.object(rp.Session, '_get_reporter')
+ @mock.patch.object(rp.Session, '_init_primary',
+ side_effect=_init_primary_side_effect,
+ autospec=True)
def setUpClass(cls, *args, **kwargs) -> None:
cls._session = rp.Session()
@@ -40,6 +49,8 @@ def setUpClass(cls, *args, **kwargs) -> None:
def tearDownClass(cls) -> None:
for p in cls._cleanup_files:
+ if p is None:
+ continue
for f in glob.glob(p):
if os.path.isdir(f):
try:
@@ -52,87 +63,50 @@ def tearDownClass(cls) -> None:
# --------------------------------------------------------------------------
#
@mock.patch.object(Agent_0, '__init__', return_value=None)
- def test_check_control(self, mocked_init):
+ def test_check_control_cb(self, mocked_init):
global_control = []
def _publish_effect(publish_type, cmd):
nonlocal global_control
global_control.append((publish_type, cmd))
- def _prepenv_effect(env_id, spec):
- return (env_id, spec)
+ def _prepenv_effect(env_name, env_spec):
+ return env_name, env_spec
- agent_cmp = Agent_0(ru.Config(), self._session)
+ agent_cmp = Agent_0()
agent_cmp._log = mock.Mock()
+ agent_cmp._prof = mock.Mock()
+ agent_cmp._pid = 'pilot.0000'
agent_cmp.publish = mock.MagicMock(side_effect=_publish_effect)
agent_cmp._prepare_env = mock.MagicMock(side_effect=_prepenv_effect)
+ agent_cmp._rpc_handlers = {'prepare_env': (agent_cmp._prepare_env, None)}
+
msg = {'cmd': 'test',
'arg': {'uid': 'rpc.0000',
'rpc': 'bye'}
}
- self.assertTrue(agent_cmp._check_control(None, msg))
+ self.assertIsNone(agent_cmp._control_cb(None, msg))
self.assertEqual(global_control, [])
- msg = {'cmd': 'rpc_req',
- 'arg': {'uid': 'rpc.0001',
- 'rpc': 'bye'}
- }
- self.assertTrue(agent_cmp._check_control(None, msg))
+ msg = RPCRequestMessage({'cmd': 'bye', 'kwargs': {'uid': 'rpc.0001'}})
+ self.assertIsNone(agent_cmp._control_cb(None, msg))
self.assertEqual(global_control, [])
- msg = {'cmd': 'rpc_req',
- 'arg': {'uid': 'rpc.0002',
- 'rpc': 'hello'}
- }
- self.assertTrue(agent_cmp._check_control(None, msg))
- self.assertIn(global_control[0], [('control_pubsub',
- {'cmd': 'rpc_res',
- 'arg': {'uid': 'rpc.0002',
- 'err': "KeyError('arg')",
- 'out': None,
- 'ret': 1}
- }),
- ('control_pubsub',
- {'cmd': 'rpc_res',
- 'arg': {'uid': 'rpc.0002',
- 'err': "KeyError('arg',)",
- 'out': None,
- 'ret': 1}
- })])
-
- msg = {'cmd': 'rpc_req',
- 'arg': {'uid': 'rpc.0003',
- 'rpc': 'hello',
- 'arg': ['World']}
- }
- self.assertTrue(agent_cmp._check_control(None, msg))
- self.assertEqual(global_control[1], ('control_pubsub',
- {'cmd': 'rpc_res',
- 'arg': {'uid': 'rpc.0003',
- 'err': None,
- 'out': 'hello World',
- 'ret': 0}
- }))
-
- msg = {'cmd': 'rpc_req',
- 'arg': {'uid': 'rpc.0004',
- 'rpc': 'prepare_env',
- 'arg': {'env_name': 'radical',
- 'env_spec': 'spec'}
- }
- }
- self.assertTrue(agent_cmp._check_control(None, msg))
- self.assertEqual(global_control[2], ('control_pubsub',
- {'cmd': 'rpc_res',
- 'arg': {'uid': 'rpc.0004',
- 'err': None,
- 'out': ('radical',
- 'spec'),
- 'ret': 0}
- }))
+ msg = RPCRequestMessage({'cmd' : 'prepare_env',
+ 'uid' : 'rpc.0004',
+ 'kwargs': {'env_name': 'radical',
+ 'env_spec': 'spec'}})
+ self.assertIsNone(agent_cmp._control_cb(None, msg))
+ self.assertEqual(global_control[0],
+ ('control_pubsub',
+ RPCResultMessage({'uid': 'rpc.0004',
+ 'val': ('radical', 'spec')})))
# --------------------------------------------------------------------------
@@ -141,16 +115,21 @@ def _prepenv_effect(env_id, spec):
@mock.patch('radical.utils.env_prep')
@mock.patch('radical.utils.sh_callout_bg')
def test_start_sub_agents(self, mocked_run_sh_callout, mocked_ru_env_prep,
- mocked_init):
+ mocked_init):
+
+ agent_0 = Agent_0()
- agent_0 = Agent_0(ru.Config(), self._session)
agent_0._pwd = tempfile.gettempdir()
agent_0._log = mock.Mock()
- agent_0._cfg = ru.Config(from_dict={
+ agent_0._sid = 'rp.session.0'
+
+ agent_0._session = mock.Mock()
+ agent_0._session.cfg = ru.Config(from_dict={
'agents': {
'agent_1': {'target' : 'node',
'components': {'agent_executing': {'count': 1}}}
- }
+ },
+ 'reg_addr': 'tcp://location'
})
agent_0._rm = mock.Mock()
@@ -185,17 +164,17 @@ def check_agent_task(agent_task, *args, **kwargs):
agent_0._rm.find_launcher.return_value = launcher
agent_files = glob.glob('%s/agent_1.*.sh' % agent_0._pwd)
- self.assertEqual(len(agent_files), 0)
+ self.assertEqual(0, len(agent_files))
agent_0._start_sub_agents()
agent_files = glob.glob('%s/agent_1.*.sh' % agent_0._pwd)
- self.assertEqual(len(agent_files), 2)
+ self.assertEqual(2, len(agent_files))
for agent_file in agent_files:
os.unlink(agent_file)
# incorrect config setup for agent ('target' must be in ['local', 'node'])
- agent_0._cfg['agents']['agent_1']['target'] = 'incorrect_target'
+ agent_0._session.cfg['agents']['agent_1']['target'] = 'incorrect_target'
with self.assertRaises(ValueError):
agent_0._start_sub_agents()
@@ -254,28 +233,28 @@ def local_advance(things, publish, push):
# --------------------------------------------------------------------------
#
@mock.patch.object(Agent_0, '__init__', return_value=None)
- def test_service_state_cb(self, mocked_init):
+ def test_ctrl_service_up(self, mocked_init):
agent_0 = Agent_0(ru.Config(), self._session)
agent_0._service_uids_launched = ['101', '102']
agent_0._service_uids_running = []
- agent_0._log = mock.Mock()
+ agent_0._pid = 'pilot_test.0000'
+ agent_0._log = mock.Mock()
+ agent_0._prof = mock.Mock()
agent_0._services_setup = mt.Event()
topic = 'test_topic'
- msg = {'cmd': 'update',
- 'arg': []}
+ msg = {'cmd': 'service_up',
+ 'arg': {}}
- msg['arg'].append({'uid' : '101',
- 'state': 'AGENT_EXECUTING'})
- agent_0._service_state_cb(topic, msg)
+ msg['arg']['uid'] = '101'
+ agent_0._control_cb(topic, msg)
self.assertFalse(agent_0._services_setup.is_set())
- msg['arg'].append({'uid' : '102',
- 'state': 'AGENT_EXECUTING'})
- agent_0._service_state_cb(topic, msg)
+ msg['arg']['uid'] = '102'
+ agent_0._control_cb(topic, msg)
self.assertTrue(agent_0._services_setup.is_set())
@@ -284,10 +263,10 @@ def test_service_state_cb(self, mocked_init):
if __name__ == '__main__':
tc = TestComponent()
- tc.test_check_control()
+ tc.test_check_control_cb()
tc.test_start_sub_agents()
tc.test_start_services()
- tc.test_service_state_cb()
+ tc.test_ctrl_service_up()
# ------------------------------------------------------------------------------
diff --git a/tests/unit_tests/test_executing/test_base.py b/tests/unit_tests/test_executing/test_base.py
index 8e4e3cdfb2..0f8df63e71 100755
--- a/tests/unit_tests/test_executing/test_base.py
+++ b/tests/unit_tests/test_executing/test_base.py
@@ -38,14 +38,14 @@ def work(self, tasks):
# method `create` may only be called on the base class
NewExecuting.create(cfg=None, session=None)
- spawners = [
- {'spawner': 'POPEN'},
- {'spawner': 'UNKNOWN'}
- ]
+ spawners = ['POPEN', 'UNKNOWN']
+
for spawner in spawners:
+ session = ru.Config(cfg={
+ 'rcfg': {'agent_spawner' : spawner}})
try:
- AgentExecutingComponent.create(cfg=spawner, session=None)
+ AgentExecutingComponent.create(cfg=spawner, session=session)
except:
# in case the spawner is not present in `rpa.executing.base`
with self.assertRaises(ValueError):
@@ -62,16 +62,19 @@ def work(self, tasks):
def test_initialize(self, mocked_rm, mocked_init):
ec = AgentExecutingComponent(cfg=None, session=None)
- ec._cfg = ru.TypedDict(from_dict={
- 'sid' : 'sid.0000',
- 'resource_manager': 'FORK',
+
+ ec._session = mock.Mock()
+ ec._session.uid = 'sid.0000'
+ ec._session.cfg = ru.TypedDict(from_dict={
+ 'resource' : 'resource_config_label',
'resource_sandbox': '',
'session_sandbox' : '',
- 'pilot_sandbox' : '',
- 'resource' : 'resource_config_label',
- 'resource_cfg' : {'order': [],
- 'launch_methods': {'SRUN': {}}}
+ 'pilot_sandbox' : ''
})
+ ec._session.rcfg = ru.TypedDict(from_dict={
+ 'resource_manager': 'FORK',
+ 'agent_spawner' : 'POPEN'})
+
ec._log = ec._prof = mock.Mock()
ec.work = ec.control_cb = mock.Mock()
ec.register_input = ec.register_output = mock.Mock()
diff --git a/tests/unit_tests/test_executing/test_popen.py b/tests/unit_tests/test_executing/test_popen.py
index 042fcb94e3..d17a967fce 100755
--- a/tests/unit_tests/test_executing/test_popen.py
+++ b/tests/unit_tests/test_executing/test_popen.py
@@ -49,10 +49,10 @@ def test_control_cb(self, mocked_logger, mocked_init):
pex._watch_queue = queue.Queue()
msg = {'cmd': '', 'arg': {'uids': ['task.0000', 'task.0001']}}
- self.assertTrue(pex.control_cb(topic=None, msg=msg))
+ self.assertIsNone(pex.control_cb(topic=None, msg=msg))
msg['cmd'] = 'cancel_tasks'
- self.assertTrue(pex.control_cb(topic=None, msg=msg))
+ self.assertIsNone(pex.control_cb(topic=None, msg=msg))
for uid in msg['arg']['uids']:
mode, tid = pex._watch_queue.get()
self.assertEqual(mode, pex.TO_CANCEL)
@@ -81,7 +81,6 @@ def test_handle_task(self, mocked_sp_popen, mocked_lm_init,
pex._log = pex._prof = pex._watch_queue = mock.Mock()
pex._log._debug_level = 1
- pex._cfg = {'resource_cfg': {'new_session_per_task': False}}
pex._pwd = ''
pex._pid = 'pilot.0000'
pex.sid = 'session.0000'
@@ -92,6 +91,9 @@ def test_handle_task(self, mocked_sp_popen, mocked_lm_init,
pex.gtod = ''
pex.prof = ''
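+ # the popen spawner now reads 'new_session_per_task' from session.rcfg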
+ pex._session = mock.Mock()
+ pex._session.rcfg = ru.Config(from_dict={'new_session_per_task': False})
+
pex._rm = mock.Mock()
pex._rm.find_launcher = mocked_find_launcher
@@ -133,7 +135,9 @@ def test_handle_task(self, mocked_sp_popen, mocked_lm_init,
def test_extend_pre_exec(self, mocked_init):
pex = Popen(cfg=None, session=None)
- pex._cfg = {}
+
+ pex._session = mock.Mock()
+ pex._session.rcfg = {}
td = {'cores_per_rank': 2,
'threading_type': '',
@@ -149,7 +153,9 @@ def test_extend_pre_exec(self, mocked_init):
td.update({'threading_type': rpc.OpenMP,
'gpu_type' : rpc.CUDA})
- pex._cfg['task_pre_exec'] = ['export TEST_ENV=test']
+
+ # target the "task_pre_exec" attribute of the resource config
+ pex._session.rcfg = {'task_pre_exec': ['export TEST_ENV=test']}
pex._extend_pre_exec(td, ranks)
self.assertIn('export OMP_NUM_THREADS=2', td['pre_exec'])
diff --git a/tests/unit_tests/test_launcher/test_launcher.py b/tests/unit_tests/test_launcher/test_launcher.py
index c7b416a118..f6c8276cfb 100755
--- a/tests/unit_tests/test_launcher/test_launcher.py
+++ b/tests/unit_tests/test_launcher/test_launcher.py
@@ -46,6 +46,11 @@ def _get_client_sandbox(self):
cls._session = Session()
cls._configs = ru.Config('radical.pilot.resource', name='*')
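+ # inject a dummy agent proxy URL into every resource config entry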
+ for site in cls._configs:
+ for rcfg in cls._configs[site].values():
+ rcfg['agent_proxy_url'] = 'tcp://localhost:1024'
+
+
# --------------------------------------------------------------------------
#
@mock.patch.object(PMGRLaunchingComponent, '__init__', return_value=None)
diff --git a/tests/unit_tests/test_pilot/test_pilot.py b/tests/unit_tests/test_pilot/test_pilot.py
index 776e9fdf07..f5c8a81f57 100755
--- a/tests/unit_tests/test_pilot/test_pilot.py
+++ b/tests/unit_tests/test_pilot/test_pilot.py
@@ -5,6 +5,7 @@
from unittest import mock
from unittest import TestCase
+import radical.utils as ru
import radical.pilot as rp
@@ -20,7 +21,14 @@ def test_pilot_uid(self, mocked_init):
pmgr._uid = 'pmgr.0000'
pmgr._log = mock.Mock()
pmgr._prof = mock.Mock()
- pmgr._session = mock.Mock()
+ pmgr._session = ru.Config(from_dict={'_reg': {
+ 'bridges.control_pubsub.addr_sub': 'tcp://localhost',
+ 'bridges.control_pubsub.addr_pub': 'tcp://localhost'}})
+
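+ # stub out the ZMQ layer so no real sockets are created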
+ ru.zmq.Subscriber = mock.Mock()
+ ru.zmq.Publisher = mock.Mock()
+ ru.zmq.test_pubsub = mock.Mock()
+
pmgr._session.uid = str(time.time()) # restart uid counter
sandbox_url = mock.Mock()
sandbox_url.path = './'
diff --git a/tests/unit_tests/test_pmgr.py b/tests/unit_tests/test_pmgr.py
index a9b6f65490..85a85b8cfd 100644
--- a/tests/unit_tests/test_pmgr.py
+++ b/tests/unit_tests/test_pmgr.py
@@ -15,9 +15,11 @@ class PMGRTestCase(TestCase):
#
@mock.patch.object(PilotManager, '__init__', return_value=None)
@mock.patch.object(PilotManager, 'wait_pilots', return_value=None)
- def test_cancel_pilots(self, mocked_wait_pilots, mocked_init):
+ @mock.patch.object(PilotManager, 'publish', return_value=None)
+ def test_cancel_pilots(self, mocked_publish, mocked_wait_pilots, mocked_init):
pmgr = PilotManager(session=None)
+ pmgr._uid = 'pmgr.0000'
pmgr._pilots_lock = mt.RLock()
pmgr._log = mock.Mock()
pmgr._session = mock.Mock()
@@ -28,12 +30,8 @@ def test_cancel_pilots(self, mocked_wait_pilots, mocked_init):
pmgr.cancel_pilots()
- self.assertTrue(pmgr._session._dbs.pilot_command.called)
self.assertTrue(mocked_wait_pilots.called)
- args, kwargs = pmgr._session._dbs.pilot_command.call_args_list[0]
- self.assertEqual('cancel_pilot', args[0])
- self.assertIn('pilot.0000', args[2]) # pilot UIDs
# --------------------------------------------------------------------------
#
diff --git a/tests/unit_tests/test_pytask.py b/tests/unit_tests/test_pytask.py
index 39761c4117..18b3ba569d 100644
--- a/tests/unit_tests/test_pytask.py
+++ b/tests/unit_tests/test_pytask.py
@@ -24,8 +24,6 @@ def AB(z):
return 2 * z
wrapped_function = partial(AA, AB)
- print(type(wrapped_function))
- print(callable(wrapped_function))
pytask_class_obj = PythonTask(wrapped_function)
self.assertIsInstance(pytask_class_obj, str)
diff --git a/tests/unit_tests/test_raptor/test_master.py b/tests/unit_tests/test_raptor/test_master.py
index 00d1d6dc8c..81b8fbb103 100644
--- a/tests/unit_tests/test_raptor/test_master.py
+++ b/tests/unit_tests/test_raptor/test_master.py
@@ -21,13 +21,21 @@ class RaptorMasterTC(TestCase):
_cleanup_files = []
+ def _init_primary_side_effect(self):
+
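+ # mimic Session._init_primary: install mocked log/prof/reporter/registry
+ # handles and a fixed session uid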
+ self._log = mock.MagicMock()
+ self._prof = mock.MagicMock()
+ self._rep = mock.MagicMock()
+ self._reg = mock.MagicMock()
+ self._uid = 'session.001'
+
+
# --------------------------------------------------------------------------
#
@classmethod
- @mock.patch.object(rp.Session, '_initialize_primary', return_value=None)
- @mock.patch.object(rp.Session, '_get_logger')
- @mock.patch.object(rp.Session, '_get_profiler')
- @mock.patch.object(rp.Session, '_get_reporter')
+ @mock.patch.object(rp.Session, '_init_primary',
+ side_effect=_init_primary_side_effect,
+ autospec=True)
def setUpClass(cls, *args, **kwargs) -> None:
cls._session = rp.Session()
@@ -39,6 +47,8 @@ def setUpClass(cls, *args, **kwargs) -> None:
def tearDownClass(cls) -> None:
for p in cls._cleanup_files:
+ if p is None:
+ continue
for f in glob.glob(p):
if os.path.isdir(f):
try:
diff --git a/tests/unit_tests/test_raptor/test_worker.py b/tests/unit_tests/test_raptor/test_worker.py
index 8e595fc488..778bcbc59a 100755
--- a/tests/unit_tests/test_raptor/test_worker.py
+++ b/tests/unit_tests/test_raptor/test_worker.py
@@ -94,7 +94,6 @@ def test_exec(self, mocked_init, mocked_Logger):
# component._log = mocked_Logger
# data = {'code': '2 + 5'}
# out, err, ret, val = component._exec(data)
- # print('===', [out, err, ret, val])
#
# self.assertEqual(ret, 0)
# self.assertEqual(val, {7})
diff --git a/tests/unit_tests/test_rm/test_base.py b/tests/unit_tests/test_rm/test_base.py
index 8f9a4499da..2063bf4a2e 100755
--- a/tests/unit_tests/test_rm/test_base.py
+++ b/tests/unit_tests/test_rm/test_base.py
@@ -45,6 +45,7 @@ def test_init_from_registry(self, mocked_prof, mocked_log, mocked_lm):
c.close()
rm = ResourceManager(cfg=ru.TypedDict({'reg_addr': reg.addr}),
+ rcfg=ru.TypedDict(),
log=mock.Mock(), prof=mock.Mock())
self.assertIsInstance(rm.info, RMInfo)
@@ -71,13 +72,16 @@ def test_init_from_scratch(self, mocked_init):
'lfs_size_per_node': 100,
'resource_cfg' : {}})
- rm = ResourceManager(cfg=None, log=None, prof=None)
+ rm = ResourceManager(cfg=None, rcfg=None, log=None, prof=None)
rm._cfg = cfg
+ rm._rcfg = ru.Config(cfg={})
+
rm._log = mock.Mock()
rm._prof = mock.Mock()
def _init_from_scratch(rm_info):
rm_info.node_list = rm._get_node_list([('node00', 16)], rm_info)
+ rm_info.cores_per_node = rm_info['cores_per_node']
return rm_info
# RM specific method (to update node_list and cores_per_node if needed)
@@ -108,7 +112,7 @@ def test_cores_cpus_map(self, mocked_init):
tc_map = ru.read_json('%s/test_cases/test_cores_gpus_map.json' % base)
- rm = ResourceManager(cfg=None, log=None, prof=None)
+ rm = ResourceManager(cfg=None, rcfg=None, log=None, prof=None)
rm._log = mock.Mock()
rm._prof = mock.Mock()
@@ -119,11 +123,14 @@ def test_cores_cpus_map(self, mocked_init):
def _init_from_scratch(rm_info_tc, rm_info_input):
_rm_info = ru.TypedDict(rm_info_tc)
_rm_info.update(rm_info_input)
+
return _rm_info
from functools import partial
- rm._cfg = ru.TypedDict(rm_cfg)
+ rm._rcfg = ru.TypedDict(rm_cfg['rcfg'])
+ del rm_cfg['rcfg']
+ rm._cfg = ru.TypedDict(rm_cfg)
rm._init_from_scratch = partial(_init_from_scratch, rm_info)
if result == 'AssertionError':
@@ -139,7 +146,7 @@ def _init_from_scratch(rm_info_tc, rm_info_input):
@mock.patch.object(ResourceManager, '__init__', return_value=None)
def test_set_info(self, mocked_init):
- rm = ResourceManager(cfg=None, log=None, prof=None)
+ rm = ResourceManager(cfg=None, rcfg=None, log=None, prof=None)
with self.assertRaises(KeyError):
# required attributes are missed
@@ -171,7 +178,8 @@ def test_find_launcher(self, mocked_lm, mocked_init):
cfg = ru.TypedDict({
'cores' : 16,
'gpus' : 2,
- 'resource_cfg' : {
+ })
+ rcfg = ru.TypedDict({
'cores_per_node' : 16,
'gpus_per_node' : 2,
'lfs_path_per_node': '${LOCAL}',
@@ -179,9 +187,9 @@ def test_find_launcher(self, mocked_lm, mocked_init):
'launch_methods' : {
'order': ['SRUN'],
'SRUN' : {}
- }}})
+ }})
- rm = ResourceManager.create('FORK', cfg, None, None)
+ rm = ResourceManager.create('FORK', cfg, rcfg, None, None)
rm._launch_order = ['SRUN']
rm._launchers = {'SRUN': mocked_lm}
@@ -208,43 +216,41 @@ def test_prepare_launch_methods(self, mocked_lm, mocked_init):
mocked_lm.create.return_value = mocked_lm
rm = ResourceManager(cfg=None, log=None, prof=None)
- rm._log = rm._prof = mock.Mock()
- rm._cfg = ru.TypedDict({'pid' : None,
- 'reg_addr': None,
- 'resource_cfg': {
- 'launch_methods': {'SRUN': {}}
- }})
+ rm._log = rm._prof = mock.Mock()
+ rm._cfg = ru.TypedDict({'pid' : None,
+ 'reg_addr': None})
+ rm._rm_info = ru.TypedDict({'launch_methods': {'SRUN': {}}})
# launching order not provided
- rm._prepare_launch_methods(None)
+ rm._prepare_launch_methods()
self.assertEqual(rm._launchers['SRUN'], mocked_lm)
self.assertEqual(rm._launch_order, ['SRUN'])
# launching order provided
- rm._cfg.resource_cfg.launch_methods = {'order': ['SSH'],
- 'SRUN' : {},
- 'SSH' : {}}
- rm._prepare_launch_methods(None)
+ rm._rm_info.launch_methods = {'order': ['SSH'],
+ 'SRUN' : {},
+ 'SSH' : {}}
+ rm._prepare_launch_methods()
self.assertEqual(rm._launch_order, ['SSH'])
# launching methods not provided
- rm._cfg.resource_cfg.launch_methods = {}
+ rm._rm_info.launch_methods = {}
with self.assertRaises(RuntimeError):
- rm._prepare_launch_methods(None)
+ rm._prepare_launch_methods()
# raise exception for every launch method
def lm_raise_exception(*args, **kwargs):
raise Exception('LM Error')
- rm._cfg.resource_cfg.launch_methods = {'SRUN': {}, 'SSH': {}}
+ rm._rm_info.launch_methods = {'SRUN': {}, 'SSH': {}}
mocked_lm.create = mock.MagicMock(side_effect=lm_raise_exception)
# all LMs will be skipped, thus RuntimeError raised
with self.assertRaises(RuntimeError):
- rm._prepare_launch_methods(None)
+ rm._prepare_launch_methods()
# check that exception was logged (sign that LM exception was raised)
self.assertTrue(rm._log.exception.called)
@@ -259,9 +265,9 @@ def lm_raise_exception_once(*args, **kwargs):
raise Exception('LM Error')
return mocked_lm
- rm._cfg.resource_cfg.launch_methods = {'SRUN': {}, 'SSH': {}}
+ rm._rm_info.launch_methods = {'SRUN': {}, 'SSH': {}}
mocked_lm.create = mock.MagicMock(side_effect=lm_raise_exception_once)
- rm._prepare_launch_methods(None)
+ rm._prepare_launch_methods()
# only second LM is considered successful
self.assertEqual(rm._launch_order, ['SSH'])
self.assertEqual(len(rm._launchers), 1)
diff --git a/tests/unit_tests/test_rm/test_cases/test_cores_gpus_map.json b/tests/unit_tests/test_rm/test_cases/test_cores_gpus_map.json
index c710e43520..86d0ef337a 100644
--- a/tests/unit_tests/test_rm/test_cases/test_cores_gpus_map.json
+++ b/tests/unit_tests/test_rm/test_cases/test_cores_gpus_map.json
@@ -65,11 +65,10 @@
"cores_per_node" : 8,
"gpus_per_node" : 2,
"lfs_size_per_node" : 0,
- "resource_cfg" : {
- "mem_per_node" : 128,
- "system_architecture": {
- "blocked_cores" : []
- }}
+ "rcfg" : {
+ "mem_per_node" : 128,
+ "system_architecture": {"blocked_cores" : []}
+ }
},
{
"nodes" : 1,
@@ -78,12 +77,11 @@
"cores_per_node" : 12,
"gpus_per_node" : 2,
"lfs_size_per_node" : 0,
- "resource_cfg" : {
- "mem_per_node" : 128,
- "system_architecture": {
- "blocked_cores" : [0, 2],
- "blocked_gpus" : [1]
- }}
+ "rcfg" : {
+ "mem_per_node" : 128,
+ "system_architecture": {"blocked_cores" : [0, 2],
+ "blocked_gpus" : [1]}
+ }
},
{
# requested more NODES than allocated
@@ -93,12 +91,11 @@
"cores_per_node" : 12,
"gpus_per_node" : 2,
"lfs_size_per_node" : 0,
- "resource_cfg" : {
- "mem_per_node" : 128,
- "system_architecture": {
- "blocked_cores" : [0, 2],
- "blocked_gpus" : [1]
- }}
+ "rcfg" : {
+ "mem_per_node" : 128,
+ "system_architecture": {"blocked_cores" : [0, 2],
+ "blocked_gpus" : [1]}
+ }
},
{
# requested more CORES than allocated
@@ -108,12 +105,11 @@
"cores_per_node" : 12,
"gpus_per_node" : 2,
"lfs_size_per_node" : 0,
- "resource_cfg" : {
- "mem_per_node" : 128,
- "system_architecture": {
- "blocked_cores" : [0, 2],
- "blocked_gpus" : [1]
- }}
+ "rcfg" : {
+ "mem_per_node" : 128,
+ "system_architecture": {"blocked_cores" : [0, 2],
+ "blocked_gpus" : [1]}
+ }
}
],
"result": [
diff --git a/tests/unit_tests/test_rm/test_fork.py b/tests/unit_tests/test_rm/test_fork.py
index 7385305401..7d31c1ded9 100755
--- a/tests/unit_tests/test_rm/test_fork.py
+++ b/tests/unit_tests/test_rm/test_fork.py
@@ -26,7 +26,8 @@ def test_init_from_scratch(self, mocked_logger, mocked_mp_cpu_count,
mocked_init):
rm_fork = Fork(cfg=None, log=None, prof=None)
- rm_fork._cfg = ru.TypedDict({'resource_cfg': {}})
+ rm_fork._cfg = ru.TypedDict({'resource_cfg': {}})
+ rm_fork._rcfg = ru.TypedDict()
rm_fork._log = mocked_logger
rm_fork._cfg.resource_cfg.fake_resources = False
@@ -58,10 +59,11 @@ def test_init_from_scratch(self, mocked_logger, mocked_mp_cpu_count,
rm_fork._init_from_scratch(rm_info)
# fake/virtual resource, request more cores than available/detected
- rm_fork._cfg.resource_cfg.fake_resources = True
+ rm_fork._rcfg.fake_resources = True
rm_info.requested_nodes = 0 # will be calculated during init
rm_info.requested_cores = mocked_mp_cpu_count() * 10
+
rm_info = rm_fork._init_from_scratch(rm_info)
self.assertGreater(rm_info.requested_cores, mocked_mp_cpu_count())
self.assertGreater(rm_info.requested_nodes, 1)
diff --git a/tests/unit_tests/test_rpc.py b/tests/unit_tests/test_rpc.py
new file mode 100755
index 0000000000..28ac93503d
--- /dev/null
+++ b/tests/unit_tests/test_rpc.py
@@ -0,0 +1,76 @@
+#!/usr/bin/env python3
+
+import sys
+import time
+
+from unittest import TestCase
+
+import radical.utils as ru
+import radical.pilot as rp
+
+
+# ------------------------------------------------------------------------------
+#
+class TestUtils(TestCase):
+
+ def __init__(self):
+
+ self._log = ru.Logger('rp', targets=['0'])
+ self._prof = ru.Profiler('rp')
+
+ self._prof.disable()
+
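+ # start a local control pubsub bridge for the RPC round-trip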
+ self._ctrl = ru.zmq.PubSub(rp.CONTROL_PUBSUB)
+ self._ctrl.start()
+
+ super().__init__()
+
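+ # give the pubsub bridge a moment to come up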
+ time.sleep(1)
+
+
+ # --------------------------------------------------------------------------
+ #
+ def rpc_handler(self):
+
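+ # attach an RPCHelper to both ends of the control pubsub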
+ rpc_1 = rp.utils.RPCHelper(self._ctrl.addr_pub, self._ctrl.addr_sub,
+ self._log, self._prof)
+
+ time.sleep(1)
+
+ check = False
+ def rpc_check(val_1, val_2, val_3=3):
+ nonlocal check
+ self.assertEqual(val_1, 1)
+ self.assertEqual(val_2, 2)
+ self.assertEqual(val_3, 3)
+ check = True
+ sys.stdout.write('stdout')
+ sys.stderr.write('stderr')
+ return 4
+
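+ # register the handler, then issue a request against it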
+ rpc_1.add_handler('check', rpc_check)
+ res = rpc_1.request('check', 1, val_2=2)
+
+ self.assertIsInstance(res.out, str)
+ self.assertIsInstance(res.err, str)
+ self.assertIsInstance(res.val, int)
+
+ self.assertEqual(str(res.out), 'stdout')
+ self.assertEqual(str(res.err), 'stderr')
+ self.assertEqual(res.val, 4)
+ self.assertEqual(res.exc, None)
+ self.assertTrue(check)
+
+
+ # a missing required argument surfaces as a RuntimeError on the caller side
+ with self.assertRaises(RuntimeError):
+ rpc_1.request('check', 1)
+
+
+# ------------------------------------------------------------------------------
+#
+if __name__ == '__main__':
+
+ tc = TestUtils()
+ tc.rpc_handler()
+
+
+# ------------------------------------------------------------------------------
+
diff --git a/tests/unit_tests/test_scheduler/test_base.py b/tests/unit_tests/test_scheduler/test_base.py
index bdbf5608c2..4eaec568bd 100755
--- a/tests/unit_tests/test_scheduler/test_base.py
+++ b/tests/unit_tests/test_scheduler/test_base.py
@@ -53,6 +53,9 @@ def test_initialize(self, mocked_env_eval, mocked_hostname, mocked_mp,
sched.register_subscriber = mock.Mock()
sched.nodes = []
sched._partitions = {}
+ sched._scheduler_process = False
+
+ sched._session = mock.Mock()
for c in self._test_cases['initialize']:
@@ -62,7 +65,11 @@ def _mock_get(_c, name):
from functools import partial
mock_get = partial(_mock_get, c)
- sched._cfg = ru.Config(from_dict=c['config'])
+ sched._session.cfg = ru.Config(
+ from_dict=c['config'])
+ sched._session.rcfg = ru.Config(
+ from_dict=c['config']['resource_cfg'])
+
with mock.patch.object(ru.zmq.RegistryClient, 'get', mock_get):
if 'RuntimeError' in c['result']:
with self.assertRaises(RuntimeError):
@@ -158,9 +165,12 @@ def _log_debug(*args):
sched = AgentSchedulingComponent(cfg=None, session=None)
sched._log = mock.Mock()
sched._log.debug.side_effect = _log_debug
+ sched._scheduler_process = True
sched._lock = mt.Lock()
sched._raptor_lock = mt.Lock()
+ sched._cancel_lock = mt.RLock()
+ sched._cancel_list = list()
task0000 = {}
sched._waitpool = {'task.0000': task0000}
diff --git a/tests/unit_tests/test_scheduler/test_cases/test_base.json b/tests/unit_tests/test_scheduler/test_cases/test_base.json
index a5837d3399..3263ece94d 100644
--- a/tests/unit_tests/test_scheduler/test_cases/test_base.json
+++ b/tests/unit_tests/test_scheduler/test_cases/test_base.json
@@ -4,11 +4,8 @@
{
"config": {
"pid" : "pid.0003",
- "resource_manager": "FORK",
"resource_cfg" : {
- "launch_methods" : {
- "FORK": {}
- }
+ "resource_manager": "FORK"
}
},
"registry": {
@@ -35,7 +32,10 @@
"mem_per_node" : 0,
"requested_nodes": 1,
"requested_cores": 8,
- "requested_gpus" : 0
+ "requested_gpus" : 0,
+ "launch_methods" : {
+ "FORK": {}
+ }
}
},
"result": [
diff --git a/tests/unit_tests/test_tmgr/test_cases/task.000000.json b/tests/unit_tests/test_tmgr/test_cases/task.000000.json
index 7cc0a4e372..9432cb42fd 100644
--- a/tests/unit_tests/test_tmgr/test_cases/task.000000.json
+++ b/tests/unit_tests/test_tmgr/test_cases/task.000000.json
@@ -1,6 +1,7 @@
{
"task": {
+ "pilot" : "pilot.0000",
"uid" : "task.000000",
"description": {
"input_staging": [
diff --git a/tests/unit_tests/test_tmgr/test_tmgr.py b/tests/unit_tests/test_tmgr/test_tmgr.py
index 4aeca8327e..1a61c6e095 100755
--- a/tests/unit_tests/test_tmgr/test_tmgr.py
+++ b/tests/unit_tests/test_tmgr/test_tmgr.py
@@ -46,7 +46,6 @@ def test_add_pilots(self, mocked_logger, mocked_init):
global_pilots = []
def publish_side_effect(rpc, pilot):
- print(type(pilot), pilot)
nonlocal global_pilots
global_pilots.append(pilot)
diff --git a/tests/unit_tests/test_tmgr/test_tmgr_staging.py b/tests/unit_tests/test_tmgr/test_tmgr_staging.py
index 843df23d28..a5c60e2f58 100644
--- a/tests/unit_tests/test_tmgr/test_tmgr_staging.py
+++ b/tests/unit_tests/test_tmgr/test_tmgr_staging.py
@@ -53,8 +53,12 @@ def test_si_create(self, mocked_component_init):
def test_si_work(self, mocked_si_init):
tmgr_si = StageInDefault(cfg={}, session=None)
+ tmgr_si._log = mock.Mock()
+ tmgr_si._session_sbox = '/tmp'
- def _mocked_advance(things, state, publish, push):
+ def _mocked_advance(things, state, publish, push, qname=None):
+ if not things:
+ return
nonlocal global_things
nonlocal global_state
global_things.append(things)
@@ -77,13 +81,13 @@ def _mocked_handle_task(task, actionables):
if not tc.get('task'):
continue
- tmgr_si.work(dict(tc['task']))
+ tmgr_si.work([dict(tc['task'])])
for tasks in global_things:
# there was only one task per call
self.assertEqual(tasks[0]['control'], 'tmgr')
# advance is called twice for the provided inputs
- self.assertEqual(len(global_things), 2)
+ self.assertEqual(2, len(global_things))
self.assertEqual(global_state, [rps.TMGR_STAGING_INPUT, rps.FAILED])
# ------------------------------------------------------------------------------
diff --git a/tests/unit_tests/test_utils.py b/tests/unit_tests/test_utils.py
index d5c6657e0d..c084fc000c 100755
--- a/tests/unit_tests/test_utils.py
+++ b/tests/unit_tests/test_utils.py
@@ -199,30 +199,30 @@ def test_resource_cfg(self):
rfs_url = rpu_misc.get_resource_fs_url('local.localhost')
self.assertIsInstance(rfs_url, ru.Url)
- self.assertEqual(str(rfs_url), rcfg_local.local.filesystem_endpoint)
+ self.assertEqual(str(rfs_url), rcfg_local.schemas.local.filesystem_endpoint)
# switched default access schema, which is the first in the list
- rpu_misc._rcfgs.local.localhost.schemas = ['ssh', 'local']
+ rpu_misc._rcfgs.local.localhost.default_schema = 'ssh'
rfs_url = rpu_misc.get_resource_fs_url('local.localhost')
- self.assertEqual(str(rfs_url), rcfg_local.ssh.filesystem_endpoint)
+ self.assertEqual(str(rfs_url), rcfg_local.schemas.ssh.filesystem_endpoint)
rfs_url = rpu_misc.get_resource_fs_url(resource='access.bridges2',
schema='gsissh')
self.assertEqual(str(rfs_url),
- rcfgs.access.bridges2.gsissh.filesystem_endpoint)
+ rcfgs.access.bridges2.schemas.gsissh.filesystem_endpoint)
# test resource job URL
rj_url = rpu_misc.get_resource_job_url('local.localhost')
self.assertIsInstance(rj_url, ru.Url)
- schema_default = rpu_misc._rcfgs.local.localhost.schemas[0]
+ schema_default = rpu_misc._rcfgs.local.localhost.default_schema
self.assertEqual(str(rj_url),
- rcfg_local[schema_default].job_manager_endpoint)
+ rcfg_local.schemas[schema_default].job_manager_endpoint)
rj_url = rpu_misc.get_resource_job_url(resource='access.bridges2',
schema='gsissh')
self.assertEqual(str(rj_url),
- rcfgs.access.bridges2.gsissh.job_manager_endpoint)
+ rcfgs.access.bridges2.schemas.gsissh.job_manager_endpoint)
# ------------------------------------------------------------------------------