Skip to content

Commit

Permalink
chore: gpu base docker image (#227)
Browse files Browse the repository at this point in the history
Signed-off-by: ThibaultFy <[email protected]>
Signed-off-by: SdgJlbl <[email protected]>
Signed-off-by: Guilhem Barthés <[email protected]>
Co-authored-by: SdgJlbl <[email protected]>
Co-authored-by: Guilhem Barthés <[email protected]>
  • Loading branch information
3 people authored Aug 27, 2024
1 parent 83f5c65 commit a7b598c
Show file tree
Hide file tree
Showing 13 changed files with 106 additions and 34 deletions.
1 change: 1 addition & 0 deletions changes/227.added
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add a Docker GPU base image, enabled by setting `use_gpu=True` on the `Dependency` object. The Docker image used is `nvidia/cuda:11.8.0-runtime-ubuntu22.04`.
1 change: 1 addition & 0 deletions changes/242.changed
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Remove the `build-essential` and `*-dev` packages after building dependencies to reduce exposure to CVEs.
23 changes: 11 additions & 12 deletions substrafl/algorithms/pytorch/torch_base_algo.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,7 @@ def __init__(
np.random.seed(seed)
torch.manual_seed(seed)

self._device = self._get_torch_device(disable_gpu=disable_gpu)

self.disable_gpu = disable_gpu
self._model = model.to(self._device)
self._optimizer = optimizer
# Move the optimizer to GPU if needed
Expand Down Expand Up @@ -212,18 +211,16 @@ def _local_train(
if self._scheduler is not None:
self._scheduler.step()

def _get_torch_device(self, disable_gpu: bool) -> torch.device:
@property
def _device(self) -> torch.device:
"""Get the torch device, CPU or GPU, depending
on availability and user input.
Args:
disable_gpu (bool): whether to use GPUs if available or not.
Returns:
torch.device: Torch device
"""
device = torch.device("cpu")
if not disable_gpu and torch.cuda.is_available():
if not self.disable_gpu and torch.cuda.is_available():
device = torch.device("cuda")
return device

Expand All @@ -249,8 +246,11 @@ def _update_from_checkpoint(self, path: Path) -> dict:
return checkpoint
"""
assert path.is_file(), f'Cannot load the model - does not exist {list(path.parent.glob("*"))}'
checkpoint = torch.load(path, map_location=self._device)
checkpoint = torch.load(path) # TO CHANGE
self.disable_gpu = checkpoint.pop("disable_gpu")

self._model.load_state_dict(checkpoint.pop("model_state_dict"))
self._model.to(self._device)

if self._optimizer is not None:
self._optimizer.load_state_dict(checkpoint.pop("optimizer_state_dict"))
Expand Down Expand Up @@ -307,17 +307,16 @@ def _get_state_to_save(self) -> dict:
checkpoint = {
"model_state_dict": self._model.state_dict(),
"index_generator": self._index_generator,
"disable_gpu": self.disable_gpu,
"random_rng_state": random.getstate(),
"numpy_rng_state": np.random.get_state(),
}
if self._optimizer is not None:
checkpoint["optimizer_state_dict"] = self._optimizer.state_dict()

if self._scheduler is not None:
checkpoint["scheduler_state_dict"] = self._scheduler.state_dict()

checkpoint["random_rng_state"] = random.getstate()

checkpoint["numpy_rng_state"] = np.random.get_state()

if self._device == torch.device("cpu"):
checkpoint["torch_rng_state"] = torch.get_rng_state()
else:
Expand Down
3 changes: 3 additions & 0 deletions substrafl/dependency/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ class Dependency(BaseModel):
force_included_paths (List[pathlib.Path]): Force include files otherwise excluded by `excluded_paths`
and `excluded_regex`
Default to []
use_gpu (bool): Use the NVIDIA Docker image with the CUDA driver, which allows the Docker image to access GPUs.
The Docker image will take longer to build. It is recommended to set ``use_gpu`` to ``True`` only if you want
to use GPUs. Default to False
"""

editable_mode: bool = False
Expand All @@ -65,6 +67,7 @@ class Dependency(BaseModel):
excluded_paths: List[Path] = Field(default_factory=list)
excluded_regex: List[str] = Field(default_factory=list)
force_included_paths: List[Path] = Field(default_factory=list)
use_gpu: bool = False
_wheels: List[Path] = []
_local_paths: List[Path] = []
_cache_directory: Optional[Path] = None
Expand Down
1 change: 1 addition & 0 deletions substrafl/remote/operations.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Dataclasses describing the operations
to execute on the remote.
"""

from dataclasses import dataclass
from typing import Any
from typing import List
Expand Down
58 changes: 42 additions & 16 deletions substrafl/remote/register/register.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,21 +29,38 @@
MINIMAL_PYTHON_VERSION = 9 # 3.9
MAXIMAL_PYTHON_VERSION = 12 # 3.12

_DEFAULT_BASE_DOCKER_IMAGE = "python:{python_version}-slim"

DOCKERFILE_TEMPLATE = """
FROM {docker_image}
_CPU_BASE_IMAGE = """
FROM python:{python_version}-slim
# update image
RUN apt update -y
RUN apt-get update -y
"""

_GPU_BASE_IMAGE = """
FROM nvidia/cuda:11.8.0-runtime-ubuntu22.04
# update image & install Python
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update -y\
&& apt-get install -y software-properties-common\
&& add-apt-repository -y ppa:deadsnakes/ppa\
&& apt-get -y upgrade\
&& apt-get install -y python{python_version} python{python_version}-venv python3-pip\
&& apt-get clean\
&& rm -rf /var/lib/apt/lists/*
"""

DOCKERFILE_TEMPLATE = """{base_docker_image}
# create a non-root user
RUN addgroup --gid 1001 group
RUN adduser --disabled-password --gecos "" --uid 1001 --gid 1001 --home /home/user user
ENV PYTHONPATH /home/user
WORKDIR /home/user
USER user
RUN python{python_version} -m venv /home/user/venv
ENV PATH="/home/user/venv/bin:$PATH" VIRTUAL_ENV="/home/user/venv"
# install dependencies
RUN python{python_version} -m pip install -U pip
Expand All @@ -56,6 +73,10 @@
# Install requirements
RUN python{python_version} -m pip install --no-cache-dir -r requirements.txt
USER root
RUN apt-get purge -y --auto-remove build-essential *-dev
USER user
# Copy all other files
COPY function.py .
COPY {internal_dir}/cls_cloudpickle {internal_dir}/
Expand Down Expand Up @@ -110,15 +131,19 @@ def _check_python_version(python_major_minor: str) -> None:
)


def _get_base_docker_image(python_major_minor: str, editable_mode: bool) -> str:
def _get_base_docker_image(python_major_minor: str, use_gpu: bool) -> str:
"""Get the base Docker image for the Dockerfile"""
_check_python_version(python_major_minor)

substratools_image = _DEFAULT_BASE_DOCKER_IMAGE.format(
python_version=python_major_minor,
)
if use_gpu:
base_docker_image = _GPU_BASE_IMAGE.format(
python_version=python_major_minor,
)
else:
base_docker_image = _CPU_BASE_IMAGE.format(
python_version=python_major_minor,
)

return substratools_image
return base_docker_image


def _generate_copy_local_files(local_files: typing.List[Path]) -> str:
Expand All @@ -132,10 +157,11 @@ def _create_dockerfile(install_libraries: bool, dependencies: Dependency, operat
# Cloudpickle will crash if we don't deserialize with the same major.minor
python_major_minor = ".".join(python_version().split(".")[:2])

# check that the Python version is supported
_check_python_version(python_major_minor)

# Get the base Docker image
substratools_image = _get_base_docker_image(
python_major_minor=python_major_minor, editable_mode=dependencies.editable_mode
)
base_docker_image = _get_base_docker_image(python_major_minor=python_major_minor, use_gpu=dependencies.use_gpu)
# Build Substrafl, Substra and Substratools, and local dependencies wheels if necessary
if install_libraries:
# generate the copy wheel command
Expand All @@ -148,7 +174,7 @@ def _create_dockerfile(install_libraries: bool, dependencies: Dependency, operat
copy_local_code_cmd = _generate_copy_local_files(dependencies._local_paths)

return DOCKERFILE_TEMPLATE.format(
docker_image=substratools_image,
base_docker_image=base_docker_image,
python_version=python_major_minor,
copy_wheels=copy_wheels_cmd,
copy_local_code=copy_local_code_cmd,
Expand Down
1 change: 1 addition & 0 deletions substrafl/remote/serializers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
Serializers to save the user code and wrap it in the Substra algo code.
"""

from substrafl.remote.serializers.pickle_serializer import PickleSerializer
from substrafl.remote.serializers.serializer import Serializer

Expand Down
1 change: 1 addition & 0 deletions substrafl/strategies/schemas.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Schemas used in the strategies.
"""

from enum import Enum
from typing import List

Expand Down
1 change: 1 addition & 0 deletions tests/dependency/installable_library/setup.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Packaging settings."""

from setuptools import setup

setup(
Expand Down
1 change: 1 addition & 0 deletions tests/dependency/installable_library2/setup.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Packaging settings."""

from setuptools import setup

setup(
Expand Down
1 change: 1 addition & 0 deletions tests/installable_library/setup.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Packaging settings."""

from setuptools import find_packages
from setuptools import setup

Expand Down
47 changes: 41 additions & 6 deletions tests/remote/register/test_register.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from substrafl.remote.decorators import remote_data
from substrafl.remote.register import register
from substrafl.remote.register.register import _create_dockerfile
from substrafl.remote.register.register import _get_base_docker_image


class RemoteClass:
Expand All @@ -37,14 +38,41 @@ def test_check_python_version(version):
register._check_python_version(version)


@pytest.mark.parametrize("version", ["3.9", "3.10", "3.11"])
@pytest.mark.parametrize("version", ["3.9", "3.10", "3.11", "3.12"])
def test_check_python_version_valid(version):
"""Does not raise for supported versions"""
register._check_python_version(version)


def test_create_dockerfile(tmp_path, mocker, local_installable_module):
mocker.patch("substrafl.remote.register.register._get_base_docker_image", return_value="substratools-mocked")
def test_get_base_docker_image_cpu():
expected_dockerfile = """
FROM python:3.12-slim
# update image
RUN apt-get update -y
"""
assert expected_dockerfile == _get_base_docker_image("3.12", use_gpu=False)


def test_get_base_docker_image_gpu():
expected_dockerfile = """
FROM nvidia/cuda:11.8.0-runtime-ubuntu22.04
# update image & install Python
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update -y\
&& apt-get install -y software-properties-common\
&& add-apt-repository -y ppa:deadsnakes/ppa\
&& apt-get -y upgrade\
&& apt-get install -y python3.11 python3.11-venv python3-pip\
&& apt-get clean\
&& rm -rf /var/lib/apt/lists/*
"""
assert expected_dockerfile == _get_base_docker_image("3.11", use_gpu=True)


def test_create_dockerfile(tmp_path, local_installable_module):
python_version = f"{sys.version_info.major}.{sys.version_info.minor}"
substrafl_wheel = f"substrafl_internal/dist/substrafl-{substrafl.__version__}-py3-none-any.whl"
substra_wheel = f"substrafl_internal/dist/substra-{substra.__version__}-py3-none-any.whl"
Expand All @@ -61,22 +89,25 @@ def test_create_dockerfile(tmp_path, mocker, local_installable_module):
pypi_dependencies=[],
local_installable_dependencies=[local_installable_dependencies],
local_code=[local_code_folder],
use_gpu=False,
)
dependencies._compute_in_cache_directory

expected_dockerfile = f"""
FROM substratools-mocked
FROM python:{python_version}-slim
# update image
RUN apt update -y
RUN apt-get update -y
# create a non-root user
RUN addgroup --gid 1001 group
RUN adduser --disabled-password --gecos "" --uid 1001 --gid 1001 --home /home/user user
ENV PYTHONPATH /home/user
WORKDIR /home/user
USER user
RUN python{python_version} -m venv /home/user/venv
ENV PATH="/home/user/venv/bin:$PATH" VIRTUAL_ENV="/home/user/venv"
# install dependencies
RUN python{python_version} -m pip install -U pip
Expand All @@ -92,6 +123,10 @@ def test_create_dockerfile(tmp_path, mocker, local_installable_module):
# Install requirements
RUN python{python_version} -m pip install --no-cache-dir -r requirements.txt
USER root
RUN apt-get purge -y --auto-remove build-essential *-dev
USER user
# Copy all other files
COPY function.py .
COPY substrafl_internal/cls_cloudpickle substrafl_internal/
Expand Down
1 change: 1 addition & 0 deletions tests/settings.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Global settings for all tests environment."""

import functools
from pathlib import Path
from typing import List
Expand Down

0 comments on commit a7b598c

Please sign in to comment.