Skip to content

Commit

Permalink
Merge updated FLTK into KFLTK
Browse files Browse the repository at this point in the history
  • Loading branch information
JMGaljaard committed Mar 30, 2022
2 parents 84c505a + ce1936a commit 4e493cf
Show file tree
Hide file tree
Showing 85 changed files with 5,778 additions and 387 deletions.
11 changes: 10 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
*.csv
*.json
*.png
*.pdf

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down Expand Up @@ -133,6 +138,10 @@ venv-*
data/**
!data/.gitkeep
output
docker_data
.idea
*.tmp.txt
docker-compose.yml

logging/**/events.out.**
refactor-notes.md
experiments/**/exps/*
5 changes: 4 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
FROM ubuntu:20.04

# Who maintains this DockerFile

MAINTAINER Jeroen Galjaard <[email protected]>

# Run build without interactive dialogue
ARG DEBIAN_FRONTEND=noninteractive

# ENV GLOO_SOCKET_IFNAME=eth0
# ENV TP_SOCKET_IFNAME=eth0

# Define the working directory of the current Docker container
WORKDIR /opt/federation-lab

Expand Down
26 changes: 25 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ Currently, it is assumed that Distributed Learning is performed (and *not* Feder
extension of the project is planned to implement a `FederatedClient` that allows for a more realistic simulation of
*Federated* Learning experiments.

### (Distributed Learning)

**General protocol:**

1. Client creation and spawning by the Orchestrator (using Kubeflow's PyTorch-Operator)
Expand All @@ -38,8 +40,30 @@ extension of the project is planned to implement a `FederatedClient` that allows
* Data between clients (`WORLD_SIZE > 1`) is not shared
* Hardware can be heterogeneous
* The location of devices matters (network latency and bandwidth)
* Communication is performed through RPC, aggregation is performed with `AllReduce`.

### Federated Learning
**General protocol:**

1. Client selection by the Federator.
2. The selected clients download the model.
3. Local training on the clients for X number of epochs
4. Weights/gradients of the trained model are sent to the Federator
5. Federator aggregates the weights/gradients to create a new and improved model
6. Updated model is shared to the clients
7. Repeat step 1 to 6 until convergence/stopping condition.

**Important notes:**

* Data is not shared between clients
* The data is non-IID
* Hardware can be heterogeneous
* The location of devices matters (network latency and bandwidth)
* Communication can be costly




### Overview of deployed project
When deploying the system, the following diagram shows how the system operates. `PyTorchJob`s are launched by the
Orchestrator (see the [Orchestrator charts](./charts/orchestrator)). The Extractor keeps track of progress (see the
Expand Down Expand Up @@ -381,7 +405,7 @@ helm install flearner ./orchestrator --namespace test -f fltk-values.yaml
```

This will spawn an `fl-server` Pod in the `test` Namespace, which will spawn Pods (using `V1PyTorchJobs`), that
run experiments. It will currently make use of the [`configs/example_cloud_experiment.json`](./configs/example_cloud_experiment.json)
run experiments. It will currently make use of the [`configs/example_cloud_experiment.json`](configs/benchmarking/example_cloud_experiment.json)
default configuration. As described in the [values](./charts/orchestrator/values.yaml) file of the `Orchestrator`s Helm chart


Expand Down
2 changes: 1 addition & 1 deletion charts/orchestrator/values.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
orchestrator:
cpu: 1000m
memory: 2000000000
configurationFile: example_cloud_experiment.json
configurationFile: benchmarking/example_cloud_experiment.json
35 changes: 0 additions & 35 deletions configs/example_cloud_experiment.json

This file was deleted.

27 changes: 27 additions & 0 deletions deploy/docker/stub_default.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Docker Compose service stub for a single client container.
# NOTE(review): the brace placeholders ({cpu_set}, {rank}, {world_size},
# {num_cpus}) are presumably filled in by the deployment tooling before this
# fragment is placed under `services:` — confirm against the generator.
# `${...}` entries are Compose variable interpolations resolved from the
# environment at `docker-compose` time.
client_name:  # name can be anything
  # container_name: federation-lab-client2  # what the name for this container would be
  cpuset: '{cpu_set}'
  restart: "no"  # if it crashes for example
  build: .  # look for the docker file where this file is currently located
  volumes:
    - ./data:/opt/federation-lab/data
    # - ./docker_data:/opt/federation-lab/data
    - ./default_models:/opt/federation-lab/default_models
    - ./data_loaders:/opt/federation-lab/data_loaders
    - ./fltk:/opt/federation-lab/fltk
  environment:
    - PYTHONUNBUFFERED=1
    - RANK={rank}
    - WORLD_SIZE={world_size}
    - EXP_CONFIG=${EXP_CONFIG_FILE}
    - MASTER_HOSTNAME=10.5.0.11
    - NIC=eth0
    - OPTIONAL_PARAMS=${OPTIONAL_PARAMS}
  ports:
    - "5002:5000"  # {machine-port}:{docker-port}
  depends_on:
    - "fl_server"
  # CPU limit for this client; nested so that `cpus` is read as
  # deploy.resources.limits.cpus rather than a stray top-level key.
  deploy:
    resources:
      limits:
        cpus: '{num_cpus}'
30 changes: 30 additions & 0 deletions deploy/docker/system_stub.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# creating a multi-container docker
# Compose stub defining the federator/server service and the external network
# it attaches to.
# NOTE(review): `{world_size}` is presumably substituted by the deployment
# tooling before use — confirm against the generator. `${...}` entries are
# Compose variable interpolations resolved from the environment.
version: "3.3"
services:
  fl_server:  # name can be anything
    container_name: federation-lab-server  # what the name for this container would be
    cpuset: '0-2'
    restart: "no"  # if it crashes for example
    build: .  # look for the docker file where this file is currently located
    volumes:
      # - ./data/MNIST:/opt/federation-lab/data/MNIST
      - ./data:/opt/federation-lab/data
      - ./output:/opt/federation-lab/output
      - ./fltk:/opt/federation-lab/fltk
    environment:
      - PYTHONUNBUFFERED=1
      - RANK=0
      - WORLD_SIZE={world_size}
      - EXP_CONFIG=${EXP_CONFIG_FILE}
      # Same address as the static ipv4_address assigned to this service below.
      - MASTER_HOSTNAME=10.5.0.11
      - NIC=eth0
      - OPTIONAL_PARAMS=${OPTIONAL_PARAMS}
    ports:
      - "5000:5000"  # {machine-port}:{docker-port}
    # Service-level network attachment with a fixed address.
    networks:
      default:
        ipv4_address: 10.5.0.11
# Top-level network definition: reuse a pre-existing network by name.
networks:
  default:
    external:
      name: local_network_dev
1 change: 1 addition & 0 deletions examples/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
These examples are outdated!
25 changes: 25 additions & 0 deletions experiments/example_cuda/descr.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
---
# Experiment configuration
total_epochs: 3
rounds: 5
epochs_per_cycle: 1
wait_for_clients: true
net: MNISTCNN
dataset: mnist
# Use CUDA if available; setting to false will force CPU
cuda: true
profiling_time: 100
warmup_round: false
output_location: 'output/example_cuda'
tensor_board_active: true
clients_per_round: 2
# Groups are nested under node_groups so they are not read as top-level keys.
node_groups:
  slow: [1, 1]
  medium: [2, 2]
  fast: [3, 3]
sampler: "uniform"  # "limit labels" || "q sampler" || "dirichlet" || "uniform" (default)
# Positional arguments; their meaning depends on the sampler chosen above.
sampler_args:
  - 0.07  # label limit || q probability || alpha || unused
  - 42  # random seed || random seed || random seed || unused
num_clients: 10
replications: 5
5 changes: 5 additions & 0 deletions experiments/example_cuda/fedavg.cfg.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Individual configuration
# NOTE(review): the key below is spelled `offload_stategy` (sic). Presumably the
# consuming loader expects this exact spelling — confirm before renaming.
offload_stategy: vanilla
deadline: 500
single_machine: true
real_time: false
48 changes: 48 additions & 0 deletions experiments/example_docker/descr.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
---
# Experiment configuration
total_epochs: 3
rounds: 5
epochs_per_cycle: 1
wait_for_clients: true
net: MNISTCNN
dataset: mnist
# Use CUDA if available; setting to false will force CPU
cuda: false
profiling_time: 100
warmup_round: false
output_location: 'output/example_docker'
tensor_board_active: true
clients_per_round: 2
# Groups are nested under node_groups so they are not read as top-level keys.
node_groups:
  slow: [1, 1]
  medium: [2, 2]
  fast: [3, 3]
sampler: "uniform"  # "limit labels" || "q sampler" || "dirichlet" || "uniform" (default)
# Positional arguments; their meaning depends on the sampler chosen above.
sampler_args:
  - 0.07  # label limit || q probability || alpha || unused
  - 42  # random seed || random seed || random seed || unused
num_clients: 2
replications: 2
# Docker deployment description. Each participant section is nested so that
# its stub-name / pin-cores / num-cores keys stay scoped to that participant
# instead of colliding as duplicate keys.
deploy:
  docker:
    base_path: deploy/docker
    federator:
      stub-name: system_stub.yml
      pin-cores: true
      num-cores: 1
    clients:
      fast:
        stub-name: stub_default.yml
        amount: 2
        pin-cores: true
        num-cores: 1
        cpu-speed: 1
        cpu-variation: 0
      slow:
        stub-name: stub_default.yml
        amount: 0
        pin-cores: true
        num-cores: 1
        cpu-speed: 1
        cpu-variation: 0

5 changes: 5 additions & 0 deletions experiments/example_docker/fedavg.cfg.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Individual configuration
# NOTE(review): the key below is spelled `offload_stategy` (sic). Presumably the
# consuming loader expects this exact spelling — confirm before renaming.
offload_stategy: vanilla
deadline: 500
single_machine: false
real_time: true
25 changes: 25 additions & 0 deletions experiments/example_native/descr.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
---
# Experiment configuration
total_epochs: 3
rounds: 5
epochs_per_cycle: 1
wait_for_clients: true
net: MNISTCNN
dataset: mnist
# Use CUDA if available; setting to false will force CPU
cuda: false
profiling_time: 100
warmup_round: false
output_location: 'output/example_native'
tensor_board_active: true
clients_per_round: 2
# Groups are nested under node_groups so they are not read as top-level keys.
node_groups:
  slow: [1, 1]
  medium: [2, 2]
  fast: [3, 3]
sampler: "uniform"  # "limit labels" || "q sampler" || "dirichlet" || "uniform" (default)
# Positional arguments; their meaning depends on the sampler chosen above.
sampler_args:
  - 0.07  # label limit || q probability || alpha || unused
  - 42  # random seed || random seed || random seed || unused
num_clients: 10
replications: 5
5 changes: 5 additions & 0 deletions experiments/example_native/fedavg.cfg.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Individual configuration
# NOTE(review): the key below is spelled `offload_stategy` (sic). Presumably the
# consuming loader expects this exact spelling — confirm before renaming.
offload_stategy: vanilla
deadline: 500
single_machine: true
real_time: false
2 changes: 1 addition & 1 deletion fltk/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@

__version__ = '0.3.1'
__version__ = '0.4.0'
Loading

0 comments on commit 4e493cf

Please sign in to comment.