Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make Launch Containerization API Based #2080

Draft
wants to merge 17 commits into
base: dev/v0.7.0
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions devops/dockerfile/fedml-launch/job/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Image Name: fedml/fedml-launch-job:cu11.6
ARG BASE_IMAGE=fedml/fedml:latest-torch1.13.1-cuda11.6-cudnn8-devel
FROM ${BASE_IMAGE}

ARG HOME_DIR=/home/fedml
## Only Modify if you want to use a different version of FedML
RUN pip3 install -U fedml
## Only Uncomment if you want to use a local version of FedML
# RUN mkdir -p ${HOME_DIR}/fedml-pip
# COPY ./python ${HOME_DIR}/fedml-pip
# WORKDIR ${HOME_DIR}/fedml-pip
# RUN pip3 install ./

# 2. MOUNT User's Local Folder (If any)
ENV DATA_FOLDER=""
VOLUME [ DATA_FOLDER ]

# 3. MOUNT Fedml Home Folder
VOLUME ["/home/fedml/launch"]

# 4. Enter the bootstrap and job entrypoints
WORKDIR ${HOME_DIR}/launch
ENV BOOTSTRAP_SCRIPT="fedml_bootstrap_generated.sh"
ENV MAIN_ENTRY="fedml_job_entry_pack.sh"

# Run Bootstrap Script and Main Entry File
CMD /bin/bash ${BOOTSTRAP_SCRIPT}; /bin/bash ${MAIN_ENTRY}
2 changes: 1 addition & 1 deletion python/fedml/computing/scheduler/comm_utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ class SchedulerConstants:
RUN_PROCESS_TYPE_BOOTSTRAP_PROCESS = "bootstrap-process"

FEDML_DEFAULT_LAUNCH_CONTAINER_PREFIX = "fedml_default_launch_container"
FEDML_DEFAULT_LAUNCH_IMAGE = "fedml/fedml-default-launch:cu12.1-u22.04"
FEDML_DEFAULT_LAUNCH_IMAGE = "fedml/fedml-launch-job:cu11.6"
FEDML_DEFAULT_LOG_DIR = ".fedml/fedml-client/fedml/logs"
FEDML_DEFAULT_DATA_DIR = ".fedml/fedml-client/fedml/data"

Expand Down
93 changes: 75 additions & 18 deletions python/fedml/computing/scheduler/comm_utils/job_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,18 +13,51 @@
from fedml.computing.scheduler.slave.client_constants import ClientConstants
from fedml.computing.scheduler.comm_utils.sys_utils import get_python_program
from fedml.computing.scheduler.scheduler_core.compute_cache_manager import ComputeCacheManager
from ..scheduler_entry.constants import Constants
from dataclasses import dataclass, field, fields

from fedml.computing.scheduler.slave.client_data_interface import FedMLClientDataInterface
from fedml.core.common.singleton import Singleton
from fedml.computing.scheduler.comm_utils.container_utils import ContainerUtils
from typing import List
from typing import List, Dict, Any, Optional
import threading
import json

run_docker_without_gpu = False


@dataclass
class JobArgs:
request_json: Dict[str, Any]
conf_file_object: Any
fedml_config_object: field(default_factory=dict)
client_rank: Optional[int] = None

def __post_init__(self):
self.run_config = self.request_json.get("run_config", {})
self.run_params = self.run_config.get("parameters", {})
self.client_rank = self.request_json.get("client_rank", 1) if self.client_rank is None else self.client_rank
self.job_yaml = self.run_params.get("job_yaml", {})
self.job_yaml_default_none = self.run_params.get("job_yaml", None)
self.job_api_key = self.run_params.get("job_api_key", None)
self.job_api_key = self.job_yaml.get("fedml_run_dynamic_params", None) if self.job_api_key is None else self.job_api_key
self.assigned_gpu_ids = self.run_params.get("gpu_ids", None)
self.job_type = self.job_yaml.get("job_type", None)
# TODO: Can we remove task_type?
self.job_type = self.job_yaml.get("task_type", Constants.JOB_TASK_TYPE_TRAIN) if self.job_type is None else self.job_type
self.containerize = self.fedml_config_object.get("containerize", None)
self.image_pull_policy = self.fedml_config_object.get("image_pull_policy", Constants.IMAGE_PULL_POLICY_ALWAYS)
self.entry_args_dict = self.conf_file_object.get("fedml_entry_args", {})
self.entry_args = self.entry_args_dict.get("entry_args", None)
self.scheduler_match_info = self.request_json.get("scheduler_match_info", {})
self.env_args = self.fedml_config_object.get("environment_args", None)
self.docker_args = JobRunnerUtils.create_instance_from_dict(DockerArgs,
self.fedml_config_object.get("docker", {}))
self.executable_interpreter = ClientConstants.CLIENT_SHELL_PS \
if platform.system() == ClientConstants.PLATFORM_WINDOWS else ClientConstants.CLIENT_SHELL_BASH
self.framework_type = self.job_yaml.get("framework_type", None)


@dataclass
class DockerArgs:
image: str = SchedulerConstants.FEDML_DEFAULT_LAUNCH_IMAGE
Expand All @@ -33,6 +66,9 @@ class DockerArgs:
registry: str = ""
ports: List[int] = field(default_factory=lambda: [2345])

def __post_init__(self):
self.client = JobRunnerUtils.get_docker_client(self)


class JobRunnerUtils(Singleton):
STATIC_RUN_LOCK_KEY_SUFFIX = "STATIC"
Expand Down Expand Up @@ -393,23 +429,44 @@ def create_instance_from_dict(data_class, input_dict: {}):
return instance

@staticmethod
def generate_bootstrap_commands(bootstrap_script_path, bootstrap_script_dir, bootstrap_script_file):
if os.path.exists(bootstrap_script_path):
bootstrap_stat = os.stat(bootstrap_script_path)
if platform.system() == 'Windows':
os.chmod(bootstrap_script_path,
bootstrap_stat.st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)
bootstrap_scripts = "{}".format(bootstrap_script_path)
else:
os.chmod(bootstrap_script_path,
bootstrap_stat.st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)
bootstrap_scripts = "cd {}; ./{}".format(
bootstrap_script_dir, os.path.basename(bootstrap_script_file))

bootstrap_scripts = str(bootstrap_scripts).replace('\\', os.sep).replace('/', os.sep)
shell_cmd_list = list()
shell_cmd_list.append(bootstrap_scripts)
return shell_cmd_list
def generate_bootstrap_commands(env_args, unzip_package_path) -> (List[str], str):
bootstrap_cmd_list = list()
bootstrap_script_path, bootstrap_script_dir, bootstrap_script_file = [None] * 3

if env_args is not None:
bootstrap_script_file = env_args.get("bootstrap", None)
if bootstrap_script_file is not None:
bootstrap_script_file = str(bootstrap_script_file).replace('\\', os.sep).replace('/', os.sep)
if platform.system() == 'Windows':
bootstrap_script_file = bootstrap_script_file.rstrip('.sh') + '.bat'
if bootstrap_script_file is not None:
bootstrap_script_dir = os.path.join(unzip_package_path, "fedml",
os.path.dirname(bootstrap_script_file))
bootstrap_script_path = os.path.join(
bootstrap_script_dir, bootstrap_script_dir, os.path.basename(bootstrap_script_file)
)

if bootstrap_script_path:
logging.info("Bootstrap commands are being generated...")
if os.path.exists(bootstrap_script_path):
bootstrap_stat = os.stat(bootstrap_script_path)
if platform.system() == 'Windows':
os.chmod(bootstrap_script_path,
bootstrap_stat.st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)
bootstrap_scripts = "{}".format(bootstrap_script_path)
else:
os.chmod(bootstrap_script_path,
bootstrap_stat.st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)
bootstrap_scripts = "cd {}; ./{}".format(
bootstrap_script_dir, os.path.basename(bootstrap_script_file))

bootstrap_scripts = str(bootstrap_scripts).replace('\\', os.sep).replace('/', os.sep)
bootstrap_cmd_list.append(bootstrap_scripts)
if len(bootstrap_cmd_list):
logging.info(f"Generated following Bootstrap commands: {bootstrap_cmd_list}")
else:
logging.info("No Bootstrap commands generated.")
return bootstrap_cmd_list, bootstrap_script_file

@staticmethod
def generate_job_execute_commands(run_id, edge_id, version,
Expand Down
Loading
Loading