Skip to content

Commit

Permalink
fltk
Browse files Browse the repository at this point in the history
  • Loading branch information
bacox committed Apr 8, 2021
1 parent 9ffe896 commit c23981a
Show file tree
Hide file tree
Showing 53 changed files with 3,195 additions and 2 deletions.
9 changes: 9 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Paths excluded from the Docker build context.

# Local virtual environment
venv
# Directories that the compose client stubs mount as volumes
default_models
data_loaders
# Dataset downloads and extracted dataset directories
data/cifar-10-batches-py
data/cifar-100-python.tar.gz
data/FashionMNIST
data/cifar-100-python
data/cifar-10-python.tar.gz
simple_example
12 changes: 12 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -127,3 +127,15 @@ dmypy.json

# Pyre type checker
.pyre/


# Project-specific entries: virtual environments, data and output directories,
# IDE settings and temporary text files
venv
venv-*
default_models
data
data_loaders
simple_example
output
docker_data
.idea
*.tmp.txt
41 changes: 41 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Image for a single federated-learning node. The node's role is chosen at
# run time through the RANK and WORLD_SIZE environment variables.

# Base image to start with
FROM ubuntu:20.04

# Who maintains this Dockerfile (MAINTAINER is deprecated; LABEL replaces it)
LABEL maintainer="Bart Cox <[email protected]>"

# Run build without interactive dialogue. ARG keeps this out of the runtime env.
ARG DEBIAN_FRONTEND=noninteractive

# Network interface used for inter-node communication inside the container
ENV GLOO_SOCKET_IFNAME=eth0 \
    TP_SOCKET_IFNAME=eth0

# Define the working directory of the current Docker container
WORKDIR /opt/federation-lab

# Install OS packages. `update` and `install` share one layer so the package
# index is never stale, and the apt lists are removed in the same layer so
# they do not end up in the image.
# NOTE(review): recommended packages are left enabled on purpose; confirm the
# Python install below does not rely on them before adding
# --no-install-recommends.
RUN apt-get update \
    && apt-get install -y \
        curl \
        iproute2 \
        net-tools \
        python3 \
        python3-pip \
        vim \
    && rm -rf /var/lib/apt/lists/*

# Copy only setup.py first so this layer stays cached while it is unchanged
COPY setup.py ./

# Install all required packages for the generator.
# pip has no `setup.py` sub-command, so the script is run with the interpreter.
# NOTE(review): the package sources are copied only after this step, so
# presumably this is here for the dependencies — confirm.
RUN python3 setup.py install

#RUN mkdir -p ./data/MNIST
#COPY ./data/MNIST ../data/MNIST
# COPY instead of ADD: this is a plain local directory, no extraction needed
COPY fltk ./fedsim
#RUN ls -la
COPY federated_learning.py ./
COPY custom_mnist.py ./
#RUN ls -la ./fedsim

# Expose the container's port to the host OS
EXPOSE 5000

# Run command by default for the executing container
# CMD ["python3", "/opt/Generatrix/rpc_parameter_server.py", "--world_size=2", "--rank=0", "--master_addr=192.168.144.2"]

#CMD python3 /opt/federation-lab/rpc_parameter_server.py --world_size=$WORLD_SIZE --rank=$RANK --master_addr=10.5.0.11
# Exec form with an explicit shell: $RANK and $WORLD_SIZE are still expanded,
# and `exec` replaces the shell so Python is PID 1 and receives stop signals.
# 10.5.0.11 is the fixed address given to fl_server in the compose system stub.
CMD ["/bin/sh", "-c", "exec python3 /opt/federation-lab/federated_learning.py $RANK $WORLD_SIZE 10.5.0.11"]
96 changes: 94 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,94 @@
# fltk
Federation Learning Toolkit
# FLTK - Federation Learning Toolkit
[![License](https://img.shields.io/badge/license-BSD-blue.svg)](LICENSE)

This toolkit can be used to run Federated Learning experiments.
Pytorch Distributed ([docs](https://pytorch.org/tutorials/beginner/dist_overview.html)) is used in this project.
The goal of this project is to launch Federated Learning nodes in a truly distributed fashion.

## Project structure

TBD

## Models

* Cifar10-CNN
* Cifar10-ResNet
* Cifar100-ResNet
* Cifar100-VGG
* Fashion-MNIST-CNN
* Fashion-MNIST-ResNet
* Reddit-LSTM

## Datasets

* Cifar10
* Cifar100
* Fashion-MNIST

## Prerequisites

When running in docker containers the following dependencies need to be installed:

* Docker
* Docker-compose

## Install
```bash
python3 setup.py install
```

[comment]: <> (```bash)

[comment]: <> (pip3 install -r ./requirements.txt)

[comment]: <> (```)

## Examples
<details><summary>Show Examples</summary>

<p>

### Single machine (Native)

#### Launch single client
Launch Federator
```bash
python3 -m fltk single configs/experiment.yaml --rank=0
```
Launch Client
```bash
python3 -m fltk single configs/experiment.yaml --rank=1
```

#### Spawn FL system
```bash
python3 -m fltk spawn configs/experiment.yaml
```

### Two machines (Native)
To start a cross-machine FL system you have to configure the network interface connected to your network.
For example, if your machine is connected to the network via the wifi interface (for example with the name `wlo1`) this has to be configured as shown below:
```python
os.environ['GLOO_SOCKET_IFNAME'] = 'wlo1'
os.environ['TP_SOCKET_IFNAME'] = 'wlo1'
```
Use `ifconfig` to find the name of the interface on your machine.

### Docker Compose

```bash
docker-compose up
```

TBD

### Google Cloud Platform
TBD

</p>
</details>

## Known issues

* Currently, there is no GPU support, neither for native runs nor for Docker Compose.
* Only the first epoch can be slow (6x - 8x slower).
16 changes: 16 additions & 0 deletions configs/experiment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
---
# Experiment configuration
# The whole document is merged into the run configuration by the launcher.
total_epochs: 10
epochs_per_cycle: 1
wait_for_clients: true
net: Cifar10CNN
dataset: cifar10
experiment_prefix: 'experiment_multi_machine'
output_location: 'output'
tensor_board_active: true
clients_per_round: 2
# The launcher reads system.federator.hostname and system.clients.amount in
# 'single' mode, so both must be nested under `system`.
system:
  federator:
    # Used as the master address
    hostname: '192.168.0.129'
  clients:
    # World size is this value plus one
    amount: 2
21 changes: 21 additions & 0 deletions deploy/templates/client_stub_default.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Compose service stub for one client, "default" profile (CPU limit 1.25).
client_name: # name can be anything
# container_name: federation-lab-client2 # what the name for this container would be
restart: "no" # if it crashes for example
build: . # look for the docker file where this file is currently located
# Host directories mounted below /opt/federation-lab in the container
volumes:
- ./docker_data:/opt/federation-lab/data
- ./default_models:/opt/federation-lab/default_models
- ./data_loaders:/opt/federation-lab/data_loaders
# RANK and WORLD_SIZE are read by the image's default command. The values in
# curly braces are placeholders, presumably filled in by the deployment
# tooling (confirm).
environment:
- PYTHONUNBUFFERED=1
- RANK={rank}
- WORLD_SIZE={world_size}
# NOTE(review): the host port is fixed at 5002 in every client stub; presumably
# rewritten per client by the tooling, otherwise two clients collide. Confirm.
ports:
- "5002:5000" # {machine-port}:{docker-port}
depends_on:
- "fl_server"
# Resource caps that define this client profile
deploy:
resources:
limits:
cpus: '1.25'
memory: 1024M
21 changes: 21 additions & 0 deletions deploy/templates/client_stub_medium.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Compose service stub for one client, "medium" profile (CPU limit 0.75).
client_name: # name can be anything
# container_name: federation-lab-client2 # what the name for this container would be
restart: "no" # if it crashes for example
build: . # look for the docker file where this file is currently located
# Host directories mounted below /opt/federation-lab in the container
volumes:
- ./docker_data:/opt/federation-lab/data
- ./default_models:/opt/federation-lab/default_models
- ./data_loaders:/opt/federation-lab/data_loaders
# RANK and WORLD_SIZE are read by the image's default command. The values in
# curly braces are placeholders, presumably filled in by the deployment
# tooling (confirm).
environment:
- PYTHONUNBUFFERED=1
- RANK={rank}
- WORLD_SIZE={world_size}
# NOTE(review): the host port is fixed at 5002 in every client stub; presumably
# rewritten per client by the tooling, otherwise two clients collide. Confirm.
ports:
- "5002:5000" # {machine-port}:{docker-port}
depends_on:
- "fl_server"
# Resource caps that define this client profile
deploy:
resources:
limits:
cpus: '0.75'
memory: 1024M
21 changes: 21 additions & 0 deletions deploy/templates/client_stub_slow.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Compose service stub for one client, "slow" profile (CPU limit 0.5).
client_name: # name can be anything
# container_name: federation-lab-client2 # what the name for this container would be
restart: "no" # if it crashes for example
build: . # look for the docker file where this file is currently located
# Host directories mounted below /opt/federation-lab in the container
volumes:
- ./docker_data:/opt/federation-lab/data
- ./default_models:/opt/federation-lab/default_models
- ./data_loaders:/opt/federation-lab/data_loaders
# RANK and WORLD_SIZE are read by the image's default command. The values in
# curly braces are placeholders, presumably filled in by the deployment
# tooling (confirm).
environment:
- PYTHONUNBUFFERED=1
- RANK={rank}
- WORLD_SIZE={world_size}
# NOTE(review): the host port is fixed at 5002 in every client stub; presumably
# rewritten per client by the tooling, otherwise two clients collide. Confirm.
ports:
- "5002:5000" # {machine-port}:{docker-port}
depends_on:
- "fl_server"
# Resource caps that define this client profile
deploy:
resources:
limits:
cpus: '0.5'
memory: 1024M
23 changes: 23 additions & 0 deletions deploy/templates/system_stub.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# creating a multi-container docker
# Base of the compose file: declares the server service and the shared network.
version: "3.3"
services:
fl_server: # name can be anything
container_name: federation-lab-server # what the name for this container would be
restart: "no" # if it crashes for example
build: . # look for the docker file where this file is currently located
volumes:
# - ./data/MNIST:/opt/federation-lab/data/MNIST
- ./output:/opt/federation-lab/output
# The server always runs as rank 0; the world-size value in curly braces is a
# placeholder, presumably filled in by the deployment tooling (confirm).
environment:
- PYTHONUNBUFFERED=1
- RANK=0
- WORLD_SIZE={world_size}
ports:
- "5000:5000" # {machine-port}:{docker-port}
# Fixed address; the image's default command uses 10.5.0.11 as the master address
networks:
default:
ipv4_address: 10.5.0.11
# The default network is an external one named local_network_dev, so it has to
# exist before the services are started.
networks:
default:
external:
name: local_network_dev
2 changes: 2 additions & 0 deletions fltk/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@

# Version string of the fltk package.
__version__ = '0.1.1'
39 changes: 39 additions & 0 deletions fltk/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import os
import sys
import torch.distributed.rpc as rpc
import logging

import yaml
import argparse

import torch.multiprocessing as mp
from fltk.federator import Federator
from fltk.launch import run_single, run_spawn
from fltk.util.base_config import BareConfig

# Configure the root logger at DEBUG level when this module is loaded.
logging.basicConfig(level=logging.DEBUG)

def main():
    """Parse the command line, load the YAML experiment config and launch a run.

    Command-line arguments:
        mode:   ``single`` or ``spawn``.
        config: path to the YAML experiment configuration file.
        --rank: integer rank of this process; required in ``single`` mode.

    In ``single`` mode the world size is ``system.clients.amount + 1``
    (presumably the extra process is the federator) and the master address is
    ``system.federator.hostname``; both are passed to ``run_single`` together
    with the rank and the merged config. In ``spawn`` mode the merged config
    is handed to ``run_spawn``.

    Exits with status 1, after printing usage, when ``single`` mode is
    requested without ``--rank``.
    """
    parser = argparse.ArgumentParser(description='Experiment launcher for the Federated Learning Testbed')
    parser.add_argument('mode', choices=['single', 'spawn'])
    parser.add_argument('config', type=str)
    parser.add_argument('--rank', type=int)

    args = parser.parse_args()

    # Keep the file open only while parsing; the launched run can be long-lived.
    with open(args.config) as file:
        # NOTE(review): FullLoader is more permissive than a plain config needs;
        # consider yaml.safe_load if configs may come from untrusted sources.
        yaml_data = yaml.load(file, Loader=yaml.FullLoader)

    cfg = BareConfig()
    cfg.merge_yaml(yaml_data)

    if args.mode == 'single':
        if args.rank is None:
            print('Missing rank argument when in \'single\' mode!')
            parser.print_help()
            # sys.exit rather than the `exit` helper, which only exists when
            # the `site` module has been loaded.
            sys.exit(1)
        world_size = yaml_data['system']['clients']['amount'] + 1
        master_address = yaml_data['system']['federator']['hostname']
        run_single(rank=args.rank, world_size=world_size, host=master_address, args=cfg)
    else:
        run_spawn(cfg)


if __name__ == "__main__":
    main()
Loading

0 comments on commit c23981a

Please sign in to comment.