Skip to content

Commit

Permalink
update runner to monitor job
Browse files Browse the repository at this point in the history
  • Loading branch information
caozhou committed Jun 6, 2024
1 parent d9af983 commit e5e9c7d
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 2 deletions.
2 changes: 1 addition & 1 deletion flagscale/auto_tuner/tuner.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ def tune(self):
self.orig_config)
best_task.action = "run"
runner = SSHRunner(best_task)
runner.run()
runner.run(monitor=True)

def need_stop(self):
"""Judge whether need to stop tuning."""
Expand Down
17 changes: 16 additions & 1 deletion flagscale/launcher/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -465,6 +465,7 @@ def stop(self):


class SSHRunner(MultiNodeRunner):

def __init__(self, config: DictConfig):
self.config = config
_update_config(self.config)
Expand Down Expand Up @@ -538,7 +539,7 @@ def _run_each(
else:
run_local_command(f"bash {host_run_script_file}", dryrun)

def run(self, with_test=False, dryrun=False):
def run(self, with_test=False, dryrun=False, monitor=False):
self._prepare()
logger.info("\n************** configuration ***********")
logger.info(f"\n{OmegaConf.to_yaml(self.config)}")
Expand Down Expand Up @@ -596,6 +597,20 @@ def run(self, with_test=False, dryrun=False):
with_test=with_test,
dryrun=dryrun,
)
# If monitoring is requested, poll the job status until the job has finished
if monitor:
# Sleep 10s to give the task time to start before the first status query
time.sleep(10)
try:
while True:
status = self._query_status()
logger.info(f"Job Status: {status.name}")
if status == JobStatus.COMPLETED_OR_IDLE:
break
time.sleep(10)
logger.info("Job Ended.")
except Exception as e:
logger.info(e)

def _stop_each(self, host, node_rank):
host_stop_script_file = _generate_stop_script(self.config, host, node_rank)
Expand Down

0 comments on commit e5e9c7d

Please sign in to comment.