Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Another attempt at fixing this weird issue #156

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 7 additions & 5 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
default_stages:
- pre-commit
repos:
- repo: https://github.com/commitizen-tools/commitizen
rev: v4.2.1
hooks:
- id: commitizen
stages: [commit-msg]
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.15.0 # Use the sha / tag you want to point at
hooks:
Expand Down Expand Up @@ -37,3 +34,8 @@ repos:
- id: "validate-cff"
args:
- "--verbose"
- repo: https://github.com/commitizen-tools/commitizen
rev: v4.2.1
hooks:
- id: commitizen
stages: [commit-msg]
18 changes: 18 additions & 0 deletions perun/backend/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,19 @@
"""Backend module."""

from typing import Dict, Type

from .backend import Backend
from .nvml import NVMLBackend
from .powercap_rapl import PowercapRAPLBackend
from .psutil import PSUTILBackend
from .rocmsmi import ROCMBackend
from .util import getBackendMetadata, getHostMetadata

# Registry of the backend implementations shipped with this package, keyed by
# class name. Each value is a Backend subclass (the class itself, not an
# instance), so consumers can iterate the mapping and instantiate on demand.
available_backends: Dict[str, Type[Backend]] = {
    "NVMLBackend": NVMLBackend,
    "PowercapRAPLBackend": PowercapRAPLBackend,
    "PSUTILBackend": PSUTILBackend,
    "ROCMBackend": ROCMBackend,
}

# Public API of the package. `Backend` and `available_backends` are listed
# explicitly so that `from perun.backend import Backend, available_backends`
# is a declared re-export (type checkers running with no-implicit-reexport
# reject imports of names that are missing from `__all__`).
__all__ = [
    "Backend",
    "available_backends",
    "getBackendMetadata",
    "getHostMetadata",
]
5 changes: 5 additions & 0 deletions perun/backend/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,11 @@ def __init__(self) -> None:
self._metadata: Dict = {}
log.info(f"Initialized {self.name} backend")

def __del__(self):
    """Backend cleanup method.

    Invoked when the backend instance is finalized. Delegates to
    ``close()`` to release backend resources. Any exception raised by
    ``close()`` is logged as a warning instead of being allowed to escape:
    callers cannot catch an exception raised inside a finalizer, and the
    interpreter would only report it as "Exception ignored in ..." on
    stderr.
    """
    log.debug("Deleting backend.")
    try:
        self.close()
    except Exception as e:
        # Best-effort cleanup: a failing shutdown must not surface from
        # the finalizer.
        log.warning(e)

@property
def metadata(self) -> Dict:
"""Return backend metadata."""
Expand Down
6 changes: 5 additions & 1 deletion perun/backend/nvml.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,11 @@

def close(self):
"""Backend shutdown code."""
self.pynvml.nvmlShutdown()
if hasattr(self, "pynvml"):
try:
self.pynvml.nvmlShutdown()
except Exception as e:
log.warning(e)

Check warning on line 51 in perun/backend/nvml.py

View check run for this annotation

Codecov / codecov/patch

perun/backend/nvml.py#L48-L51

Added lines #L48 - L51 were not covered by tests

def availableSensors(self) -> Dict[str, Tuple]:
"""Return string ids of visible devices.
Expand Down
3 changes: 2 additions & 1 deletion perun/backend/rocmsmi.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@

def close(self):
"""Backend cleanup."""
self.amdsmi.amdsmi_shut_down()
if hasattr(self, "amdsmi"):
self.amdsmi.amdsmi_shut_down()

Check warning on line 48 in perun/backend/rocmsmi.py

View check run for this annotation

Codecov / codecov/patch

perun/backend/rocmsmi.py#L48

Added line #L48 was not covered by tests

def availableSensors(self) -> Dict[str, Tuple]:
"""Return string ids of visible devices.
Expand Down
31 changes: 11 additions & 20 deletions perun/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,15 @@
from configparser import ConfigParser
from datetime import datetime
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Tuple, Type
from typing import Any, Callable, Dict, List, Optional, Tuple

from perun import __version__
from perun.backend.backend import Backend
from perun.backend.nvml import NVMLBackend
from perun.backend.powercap_rapl import PowercapRAPLBackend
from perun.backend.psutil import PSUTILBackend
from perun.backend.rocmsmi import ROCMBackend
from perun.backend.util import getBackendMetadata, getHostMetadata
from perun.backend import (
Backend,
available_backends,
getBackendMetadata,
getHostMetadata,
)
from perun.comm import Comm
from perun.configuration import sanitize_config
from perun.coordination import assignSensors, getHostRankDict
Expand Down Expand Up @@ -104,15 +104,9 @@ def backends(self) -> Dict[str, Backend]:
"""
if not self._backends:
self._backends = {}
classList: Dict[str, Type[Backend]] = {
"PowercapRAPL": PowercapRAPLBackend,
"NVML": NVMLBackend,
"PSUTIL": PSUTILBackend,
"ROCM": ROCMBackend,
}
for name, backend in classList.items():
for name, backend_class in available_backends.items():
try:
backend_instance = backend()
backend_instance = backend_class()
self._backends[backend_instance.id] = backend_instance
except ImportError as ie:
log.info(f"Missing dependencies for backend {name}")
Expand Down Expand Up @@ -358,11 +352,6 @@ def monitor_application(
log.error(
f"Rank {self.comm.Get_rank()}: Failed to start run {i}, saving previous runs (if any), and exiting."
)
self._monitor.status = MonitorStatus.PROCESSING
# Ideally this should just retry running the application, hoping for the perunSubprocess to work, but that is not working as expected because of heat's incrementalSVD, so we just exit the loop for now. This should be fixed in the future.
# This should still save the data from the previous run, so it should be fine.

# continue
break

if self.comm.Get_rank() == 0 and runNode:
Expand All @@ -376,6 +365,8 @@ def monitor_application(

i += 1

self._monitor.close()

# Get app node data if it exists
if self.comm.Get_rank() == 0 and len(multirun_nodes) > 0:
multirun_node = self._process_multirun(multirun_nodes)
Expand Down
Loading