MPI example #270

Open · wants to merge 15 commits into `main`
2 changes: 1 addition & 1 deletion .github/workflows/test_suite_ubuntu.yml
@@ -66,7 +66,7 @@ jobs:
- name: Install an MPI distribution
run: |
sudo apt update
- sudo apt install mpich
+ sudo apt install openmpi-bin openmpi-common libopenmpi-dev

- name: Install pFUnit
run: |
66 changes: 66 additions & 0 deletions examples/7_MPI/CMakeLists.txt
@@ -0,0 +1,66 @@
cmake_minimum_required(VERSION 3.15...3.31)
# policy CMP0076 - target_sources source files are relative to file where
# target_sources is run
cmake_policy(SET CMP0076 NEW)

set(PROJECT_NAME MPIExample)

project(${PROJECT_NAME} LANGUAGES Fortran)

# Build in Debug mode if not specified
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE
Debug
CACHE STRING "" FORCE)
endif()

find_package(FTorch)
find_package(MPI REQUIRED)
message(STATUS "Building with Fortran PyTorch coupling")

# Fortran example
add_executable(mpi_infer_fortran mpi_infer_fortran.f90)
target_link_libraries(mpi_infer_fortran PRIVATE FTorch::ftorch)
target_link_libraries(mpi_infer_fortran PRIVATE MPI::MPI_Fortran)

# Integration testing
if(CMAKE_BUILD_TESTS)
include(CTest)

# 1. Check the PyTorch model runs and its outputs meet expectations
add_test(NAME simplenet COMMAND ${Python_EXECUTABLE}
${PROJECT_SOURCE_DIR}/simplenet.py)

# 2. Check the model is saved to file in the expected location with the
# pt2ts.py script
add_test(
NAME pt2ts
COMMAND ${Python_EXECUTABLE} ${PROJECT_SOURCE_DIR}/pt2ts.py --filepath
${PROJECT_BINARY_DIR}
WORKING_DIRECTORY ${PROJECT_BINARY_DIR})

# 3. Check the model can be loaded from file and run with MPI in Python and
# that its outputs meet expectations
add_test(
NAME mpi_infer_python
COMMAND
${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} 2 ${Python_EXECUTABLE}
${PROJECT_SOURCE_DIR}/mpi_infer_python.py --filepath ${PROJECT_BINARY_DIR}
WORKING_DIRECTORY ${PROJECT_BINARY_DIR})
set_tests_properties(
mpi_infer_python PROPERTIES PASS_REGULAR_EXPRESSION
"MPI Python example ran successfully")

# 4. Check the model can be loaded from file and run with MPI in Fortran and
# that its outputs meet expectations
add_test(
NAME mpi_infer_fortran
COMMAND
${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} 2 ./mpi_infer_fortran
${PROJECT_BINARY_DIR}/saved_simplenet_model_cpu.pt
# Command line argument: model file
WORKING_DIRECTORY ${PROJECT_BINARY_DIR})
set_tests_properties(
mpi_infer_fortran PROPERTIES PASS_REGULAR_EXPRESSION
"MPI Fortran example ran successfully")
endif()
117 changes: 117 additions & 0 deletions examples/7_MPI/README.md
@@ -0,0 +1,117 @@
# Example 7 - MPI

This example revisits the SimpleNet example and demonstrates how to run it using
MPI parallelism.


## Description

The Python file `simplenet.py` is copied from the earlier example. Recall that
it defines a very simple PyTorch network that takes an input of length 5 and
applies a single `Linear` layer to multiply it by 2.
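
For reference, a minimal sketch of such a network (the exact implementation in
`simplenet.py` may differ in detail) might look like:
```
import torch
from torch import nn

class SimpleNet(nn.Module):
    def __init__(self):
        super().__init__()
        # A single Linear layer whose weights are fixed to 2 * identity,
        # so the forward pass simply doubles its input
        self._fc = nn.Linear(in_features=5, out_features=5, bias=False)
        with torch.no_grad():
            self._fc.weight.copy_(2.0 * torch.eye(5))

    def forward(self, batch):
        return self._fc(batch)
```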

The same `pt2ts.py` tool is used to save the simple network to TorchScript.
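
In essence, the tool scripts the model and serialises it to disk; a hedged
sketch of the core step (the real `pt2ts.py` also handles tracing options and
output paths) is:
```
import torch
from simplenet import SimpleNet

model = SimpleNet()
model.eval()
scripted_model = torch.jit.script(model)  # compile the model to TorchScript
scripted_model.save("saved_simplenet_model_cpu.pt")
```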

A series of files `mpi_infer_<LANG>` then use language bindings to run the
TorchScript model in inference mode.

## Dependencies

To run this example requires:

- CMake
- An MPI installation
- `mpif90` (an MPI-enabled Fortran compiler wrapper)
- FTorch (installed as described in the main package)
- Python 3

## Running

To run this example install FTorch as described in the main documentation. Then
from this directory create a virtual environment and install the necessary
Python modules:
```
python3 -m venv venv
source venv/bin/activate
pip install -r requirements.txt
```

You can check the network is set up correctly by running `simplenet.py`:
```
python3 simplenet.py
```
As before, this defines the network and runs it with an input tensor
[0.0, 1.0, 2.0, 3.0, 4.0] to produce the result:
```
tensor([[0, 2, 4, 6, 8]])
```

To save the `SimpleNet` model to TorchScript, run the modified version of the
`pt2ts.py` tool:
```
python3 pt2ts.py
```
which will generate `saved_simplenet_model_cpu.pt` - the TorchScript instance
of the network.

You can check that everything is working by running the `mpi_infer_python.py`
script. It's set up with MPI such that each MPI rank runs an independent copy
of the model on the CPU. You should substitute `<NP>` with the number of MPI
processes you wish to run with:
```
mpiexec -np <NP> python3 mpi_infer_python.py
```
This reads the model in from the TorchScript file and runs it with a different
input tensor on each MPI rank: [0.0, 1.0, 2.0, 3.0, 4.0], with the rank index
added to each entry. Running with `NP=2`, the result should be (some
permutation of):
```
rank 0: result:
tensor([[0., 2., 4., 6., 8.]])
rank 1: result:
tensor([[ 2., 4., 6., 8., 10.]])
```
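
For illustration, the rank-dependent part of the script might look like the
following hedged sketch using `mpi4py` (names here are illustrative, not
necessarily those used in `mpi_infer_python.py`):
```
import torch
from mpi4py import MPI

comm = MPI.COMM_WORLD
rank = comm.Get_rank()

# Load the TorchScript model saved by pt2ts.py
model = torch.jit.load("saved_simplenet_model_cpu.pt")

# Offset the input tensor by the rank index so each rank
# computes a different result
input_tensor = torch.tensor([0.0, 1.0, 2.0, 3.0, 4.0]) + rank
with torch.no_grad():
    output = model(input_tensor.unsqueeze(0))

print(f"rank {rank}: result:\n{output}")
```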

At this point we no longer require Python, so we can deactivate the virtual
environment:
```
deactivate
```

To call the saved `SimpleNet` model from Fortran we need to compile the
`mpi_infer_fortran.f90` file. This can be done using the included
`CMakeLists.txt` as follows, noting that we need to use an MPI-enabled Fortran
compiler:
```
mkdir build
cd build
cmake .. -DCMAKE_PREFIX_PATH=<path/to/your/installation/of/library/> -DCMAKE_BUILD_TYPE=Release
cmake --build .
```

(Note that the Fortran compiler can be chosen explicitly with the
`-DCMAKE_Fortran_COMPILER` flag, and should match the compiler that was used to
locally build FTorch.)

To run the compiled code calling the saved `SimpleNet` TorchScript from Fortran,
run the executable with an argument of the saved model file. Again, substitute
`<NP>` with the desired number of MPI processes:
```
mpiexec -np <NP> ./mpi_infer_fortran ../saved_simplenet_model_cpu.pt
```

This runs the model with the same inputs as described above and should produce (some
permutation of) the output:
```
input on rank 0: [ 0.0, 1.0, 2.0, 3.0, 4.0]
input on rank 1: [ 1.0, 2.0, 3.0, 4.0, 5.0]
output on rank 0: [ 0.0, 2.0, 4.0, 6.0, 8.0]
output on rank 1: [ 2.0, 4.0, 6.0, 8.0, 10.0]
```

Alternatively, we can use `make` instead of CMake by copying the Makefile over
from the first example:
```
cp ../1_SimpleNet/Makefile .
```
See the instructions in that example directory for further details.
124 changes: 124 additions & 0 deletions examples/7_MPI/mpi_infer_fortran.f90
@@ -0,0 +1,124 @@
program inference

! Import precision info from iso
use, intrinsic :: iso_fortran_env, only : sp => real32

! Import our library for interfacing with PyTorch
use ftorch, only : torch_model, torch_tensor, torch_kCPU, torch_delete, &
torch_tensor_from_array, torch_model_load, torch_model_forward

! Import our tools module for testing utils
use ftorch_test_utils, only : assert_allclose

! Import MPI
use mpi, only : mpi_comm_rank, mpi_comm_size, mpi_comm_world, mpi_finalize, mpi_float, &
mpi_gather, mpi_init

implicit none

! Set working precision for reals
integer, parameter :: wp = sp

integer :: num_args, ix
character(len=128), dimension(:), allocatable :: args

! Set up Fortran data structures
real(wp), dimension(5), target :: in_data
real(wp), dimension(5), target :: out_data
real(wp), dimension(5), target :: expected
integer, parameter :: tensor_layout(1) = [1]

! Set up Torch data structures
! The net, a vector of input tensors (in this case we only have one), and the output tensor
type(torch_model) :: model
type(torch_tensor), dimension(1) :: in_tensors
type(torch_tensor), dimension(1) :: out_tensors

! Flag for testing
logical :: test_pass

! MPI configuration
integer :: rank, size, ierr, i

! Variables for testing
real(wp), allocatable, dimension(:,:) :: recvbuf
real(wp), dimension(5) :: result_chk
integer :: rank_chk

call mpi_init(ierr)
call mpi_comm_rank(mpi_comm_world, rank, ierr)
call mpi_comm_size(mpi_comm_world, size, ierr)

! Check MPI was configured correctly
if (size == 1) then
write(*,*) "MPI communicator size is 1, indicating that it is not configured correctly"
write(*,*) "(assuming you specified more than one rank)"
call clean_up()
stop 999
end if

! Get TorchScript model file as a command line argument
num_args = command_argument_count()
allocate(args(num_args))
do ix = 1, num_args
call get_command_argument(ix,args(ix))
end do

! Initialise data and print the values used on each MPI rank
in_data = [(rank + i, i = 0, 4)]
write(unit=6, fmt="('input on rank ',i1,': ')", advance="no") rank
write(unit=6, fmt=100) in_data(:)
100 format('[',4(f5.1,','),f5.1,']')

! Create Torch input/output tensors from the above arrays
call torch_tensor_from_array(in_tensors(1), in_data, tensor_layout, torch_kCPU)
call torch_tensor_from_array(out_tensors(1), out_data, tensor_layout, torch_kCPU)

! Load ML model
call torch_model_load(model, args(1), torch_kCPU)

! Run inference on each MPI rank
call torch_model_forward(model, in_tensors, out_tensors)

! Print the values computed on each MPI rank
write(unit=6, fmt="('output on rank ',i1,': ')", advance="no") rank
write(unit=6, fmt=100) out_data(:)

! Gather the outputs onto rank 0
allocate(recvbuf(5,size))
call mpi_gather(out_data, 5, mpi_float, recvbuf, 5, mpi_float, 0, mpi_comm_world, ierr)

! Check that the correct values were attained
if (rank == 0) then

! Check output tensor matches expected value
do rank_chk = 0, size-1
expected = [(2 * (rank_chk + i), i = 0, 4)]
result_chk(:) = recvbuf(:,rank_chk+1)
test_pass = assert_allclose(result_chk, expected, test_name="MPI")
if (.not. test_pass) then
write(unit=6, fmt="('rank ',i1,' result: ')") rank_chk
write(unit=6, fmt=100) result_chk(:)
write(unit=6, fmt="('does not match expected value')")
write(unit=6, fmt=100) expected(:)
call clean_up()
stop 999
end if
end do

write (*,*) "MPI Fortran example ran successfully"
end if

call clean_up()

contains

subroutine clean_up()
call torch_delete(model)
call torch_delete(in_tensors)
call torch_delete(out_tensors)
call mpi_finalize(ierr)
! Guard the deallocation: clean_up may be called before recvbuf is allocated
if (allocated(recvbuf)) deallocate(recvbuf)
end subroutine clean_up

end program inference