Skip to content

Commit

Permalink
add mpirun mode
Browse files Browse the repository at this point in the history
  • Loading branch information
caozhou committed Jun 5, 2024
1 parent d9e80ad commit 4e7b289
Show file tree
Hide file tree
Showing 3 changed files with 74 additions and 4 deletions.
4 changes: 3 additions & 1 deletion flagscale/auto_tuner/tuner.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,9 @@ def __init__(self, config: DictConfig):
# Set platform envs
if "platform" not in self.config.auto_tuner:
self.config.auto_tuner.platform = {}
if os.environ.get("AIRS_SWITCH", None) != "False":

# Any non-empty AIRS_SWITCH value means we are running on the platform.
if os.environ.get("AIRS_SWITCH", None):
self.config.auto_tuner.platform.airs_switch = True

if os.environ.get("AIRS_SIZE", None):
Expand Down
65 changes: 65 additions & 0 deletions flagscale/auto_tuner/utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
import re
import os
import socket
import subprocess
from flagscale.launcher.runner import parse_hostfile


def divisible(x, y):
if x % y == 0:
return True
Expand Down Expand Up @@ -25,6 +32,7 @@ def beside(keys, strategy, history):


def sort_by_memory(strategy):
"""Sort strategy by memory."""
return (
-strategy["use_recompute"],
-strategy["tensor_model_parallel_size"],
Expand All @@ -45,6 +53,7 @@ def sort_by_memory(strategy):


def sort_by_performance(strategy):
"""Sort strategy by performance potentially."""
return (
strategy["use_recompute"],
-strategy["tensor_model_parallel_size"],
Expand All @@ -62,3 +71,59 @@ def sort_by_performance(strategy):
else float("inf")
),
)


def is_ip_addr(master):
    """Check if master is ip address.

    Args:
        master: Candidate value, typically a host entry. Non-string values
            are rejected rather than raising.

    Returns:
        True if ``master`` is a dotted-quad IPv4 address whose four octets
        are each in the range 0-255 with no leading zeros, False otherwise.
    """
    if not isinstance(master, str):
        return False
    octet = r"(25[0-5]|2[0-4]\d|1\d{2}|[1-9]?\d)"
    pattern = r"(" + octet + r"\.){3}" + octet
    # fullmatch (rather than match + "$") so that a trailing newline such as
    # "10.0.0.1\n" is not accepted as a valid address.
    return re.fullmatch(pattern, master) is not None

def get_ip_addr():
    """Get ip address.

    Resolves the local hostname (via its fully-qualified name) to an IPv4
    address.

    Returns:
        The resolved address as a string, or ``'127.0.0.1'`` when the
        hostname cannot be resolved.
    """
    try:
        hostname = socket.gethostname()
        ip = socket.gethostbyname(socket.getfqdn(hostname))
    except (OSError, UnicodeError):
        # Best-effort lookup: resolution errors (socket.gaierror/herror are
        # OSError subclasses) and hostname encoding errors fall back to
        # loopback. A bare ``except`` would also swallow KeyboardInterrupt
        # and SystemExit, so only these are caught.
        ip = '127.0.0.1'
    return ip


def is_master(config):
    """Check if current node is master.

    The hostfile is taken from ``config.experiment.runner.hostfile``; when
    the ``AIRS_SWITCH`` environment variable is set and non-empty, a
    non-empty ``AIRS_HOSTFILE_PATH`` overrides it. The first host listed in
    the parsed hostfile is treated as the master.

    Args:
        config: Configuration object exposing ``experiment.runner`` with a
            dict-like ``get`` (``nnodes`` and ``hostfile`` are read).

    Returns:
        True when no hosts are configured in single-node mode, or when the
        current node matches the first host of the hostfile (compared by IP
        address if that entry is an IPv4 address, otherwise by the output of
        the ``hostname`` command). False otherwise.

    Raises:
        ValueError: If ``nnodes`` is greater than 1 but no hosts could be
            obtained from the hostfile.
        subprocess.CalledProcessError: If the ``hostname`` command fails.
    """
    runner_cfg = config.experiment.runner
    multi_nodes = runner_cfg.get("nnodes", 1) > 1

    # An empty/missing hostfile setting is normalised to None.
    hostfile = runner_cfg.get("hostfile", None) or None
    if os.environ.get("AIRS_SWITCH", None):
        airs_hostfile = os.environ.get("AIRS_HOSTFILE_PATH", None)
        if airs_hostfile:
            hostfile = airs_hostfile

    # NOTE(review): parse_hostfile is assumed to accept None and to return a
    # mapping keyed by host in hostfile order -- confirm against its
    # implementation.
    resources = parse_hostfile(hostfile)

    if not resources:
        if multi_nodes:
            raise ValueError("In the multi-node mode, please set the hostfile")
        # Local single-node run without a hostfile: this node is the master.
        return True

    # dict views are not subscriptable on Python 3, so take the first key via
    # an iterator instead of ``resources.keys()[0]``. This path is also used
    # when a hostfile is given with a single node, which previously fell
    # through and returned None.
    master = next(iter(resources))
    if is_ip_addr(master):
        return get_ip_addr() == master

    # Fixed command, so an argument list without a shell is sufficient.
    output = subprocess.run(
        ["hostname"],
        check=True,
        text=True,
        capture_output=True,
    )
    return output.stdout.strip() == master
9 changes: 6 additions & 3 deletions run.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,12 @@
def main(config: DictConfig) -> None:
if config.action == "auto_tune":
from flagscale.auto_tuner import AutoTuner

tuner = AutoTuner(config)
tuner.tune()
# In the MPIRUN scenario, run only one autotuner process (on the master node).
# NOTE: This is a temporary solution and will be updated with cloud runner.
from flagscale.auto_tuner.utils import is_master
if is_master(config):
tuner = AutoTuner(config)
tuner.tune()
else:
if config.experiment.runner.get("type", "ssh") == "ssh":
runner = SSHRunner(config)
Expand Down

0 comments on commit 4e7b289

Please sign in to comment.