Merge updated FLTK into KFLTK

JMGaljaard · Mar 30, 2022 · 4e493cf · 4e493cf
2 parents 84c505a + ce1936a
commit 4e493cf
Show file tree

Hide file tree

Showing 85 changed files with 5,778 additions and 387 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,8 @@
+*.csv
+*.json
+*.png
+*.pdf
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
@@ -133,6 +138,10 @@ venv-*
 data/**
 !data/.gitkeep
 output
+docker_data
 .idea
+*.tmp.txt
+docker-compose.yml
 
-logging/**/events.out.**
+refactor-notes.md
+experiments/**/exps/*
diff --git a/Dockerfile b/Dockerfile
@@ -1,11 +1,14 @@
 FROM ubuntu:20.04
 
-# Who maintains this DockerFile
+
 MAINTAINER Jeroen Galjaard <[email protected]>
 
 # Run build without interactive dialogue
 ARG DEBIAN_FRONTEND=noninteractive
 
+# ENV GLOO_SOCKET_IFNAME=eth0
+# ENV TP_SOCKET_IFNAME=eth0
+
 # Define the working directory of the current Docker container
 WORKDIR /opt/federation-lab
 

diff --git a/README.md b/README.md
@@ -24,6 +24,8 @@ Currently, it is assumed that Distributed Learning is performed (and *not* Feder
 extension of the project is planned to implement a `FederatedClient` that allows for a more realistic simulation of 
 *Federated* Learning experiments.
 
+### (Distributed Learning)
+
 **General protocol:**
 
 1. Client creation and spawning by the Orchestrator (using Kubeflow's PyTorch-Operator)
@@ -38,8 +40,30 @@ extension of the project is planned to implement a `FederatedClient` that allows
 * Data between clients (`WORLD_SIZE > 1`) is not shared
 * Hardware can be heterogeneous
 * The location of devices matters (network latency and bandwidth)
+* Communication is performed through RPC, aggregation is performed with `AllReduce`.
+
+### Federated Learning
+**General protocol:**
+
+1. Client selection by the Federator.
+2. The selected clients download the model.
+3. Local training on the clients for X number of epochs
+4. Weights/gradients of the trained model are sent to the Federator
+5. Federator aggregates the weights/gradients to create a new and improved model
+6. The updated model is shared with the clients
+7. Repeat steps 1 to 6 until convergence or a stopping condition is reached.
+
+**Important notes:**
+
+* Data is not shared between clients
+* The data is non-IID
+* Hardware can be heterogeneous
+* The location of devices matters (network latency and bandwidth)
 * Communication can be costly
 
+
+
+
 ### Overview of deployed project
 When deploying the system, the following diagram shows how the system operates. `PyTorchJob`s are launched by the 
 Orchestrator (see the [Orchestrator charts](./charts/orchestrator)). The Extractor keeps track of progress (see the 
@@ -381,7 +405,7 @@ helm install flearner ./orchestrator --namespace test -f fltk-values.yaml
 ```
 
 This will spawn an `fl-server` Pod in the `test` Namespace, which will spawn Pods (using `V1PyTorchJobs`), that
-run experiments. It will currently make use of the [`configs/example_cloud_experiment.json`](./configs/example_cloud_experiment.json)
+run experiments. It will currently make use of the [`configs/benchmarking/example_cloud_experiment.json`](configs/benchmarking/example_cloud_experiment.json)
 default configuration. As described in the [values](./charts/orchestrator/values.yaml) file of the `Orchestrator`s Helm chart
 
 

diff --git a/charts/orchestrator/values.yaml b/charts/orchestrator/values.yaml
@@ -1,4 +1,4 @@
 orchestrator:
     cpu: 1000m
     memory: 2000000000
-    configurationFile: example_cloud_experiment.json
+    configurationFile: benchmarking/example_cloud_experiment.json
diff --git a/configs/example_cloud_experiment.json b/configs/example_cloud_experiment.json
diff --git a/deploy/docker/stub_default.yml b/deploy/docker/stub_default.yml
@@ -0,0 +1,27 @@
+client_name: # name can be anything
+#    container_name: federation-lab-client2 # what the name for this container would be
+    cpuset: '{cpu_set}'
+    restart: "no" # never restart the container automatically, e.g. after a crash
+    build: . # look for the docker file where this file is currently located
+    volumes:
+      - ./data:/opt/federation-lab/data
+#      - ./docker_data:/opt/federation-lab/data
+      - ./default_models:/opt/federation-lab/default_models
+      - ./data_loaders:/opt/federation-lab/data_loaders
+      - ./fltk:/opt/federation-lab/fltk
+    environment:
+      - PYTHONUNBUFFERED=1
+      - RANK={rank}
+      - WORLD_SIZE={world_size}
+      - EXP_CONFIG=${EXP_CONFIG_FILE}
+      - MASTER_HOSTNAME=10.5.0.11
+      - NIC=eth0
+      - OPTIONAL_PARAMS=${OPTIONAL_PARAMS}
+    ports:
+      - "5002:5000" # {machine-port}:{docker-port}
+    depends_on:
+      - "fl_server"
+    deploy:
+      resources:
+        limits:
+          cpus: '{num_cpus}'
diff --git a/deploy/docker/system_stub.yml b/deploy/docker/system_stub.yml
@@ -0,0 +1,30 @@
+# creating a multi-container docker
+version: "3.3"
+services:
+  fl_server: # name can be anything
+    container_name: federation-lab-server # what the name for this container would be
+    cpuset: '0-2'
+    restart: "no" # never restart the container automatically, e.g. after a crash
+    build: . # look for the docker file where this file is currently located
+    volumes:
+#      - ./data/MNIST:/opt/federation-lab/data/MNIST
+      - ./data:/opt/federation-lab/data
+      - ./output:/opt/federation-lab/output
+      - ./fltk:/opt/federation-lab/fltk
+    environment:
+      - PYTHONUNBUFFERED=1
+      - RANK=0
+      - WORLD_SIZE={world_size}
+      - EXP_CONFIG=${EXP_CONFIG_FILE}
+      - MASTER_HOSTNAME=10.5.0.11
+      - NIC=eth0
+      - OPTIONAL_PARAMS=${OPTIONAL_PARAMS}
+    ports:
+      - "5000:5000" # {machine-port}:{docker-port}
+    networks:
+      default:
+        ipv4_address: 10.5.0.11
+networks:
+  default:
+    external:
+      name: local_network_dev
diff --git a/examples/README.md b/examples/README.md
@@ -0,0 +1 @@
+These examples are outdated!
diff --git a/experiments/example_cuda/descr.yaml b/experiments/example_cuda/descr.yaml
@@ -0,0 +1,25 @@
+---
+# Experiment configuration
+total_epochs: 3
+rounds: 5
+epochs_per_cycle: 1
+wait_for_clients: true
+net: MNISTCNN
+dataset: mnist
+# Use CUDA if available; setting to false will force CPU
+cuda: true
+profiling_time: 100
+warmup_round: false
+output_location: 'output/example_cuda'
+tensor_board_active: true
+clients_per_round: 2
+node_groups:
+  slow: [1, 1]
+  medium: [2, 2]
+  fast: [3, 3]
+sampler: "uniform" # "limit labels" || "q sampler" || "dirichlet" || "uniform" (default)
+sampler_args:
+  - 0.07     # label limit || q probability || alpha || unused
+  - 42    # random seed || random seed || random seed || unused
+num_clients: 10
+replications: 5
diff --git a/experiments/example_cuda/fedavg.cfg.yaml b/experiments/example_cuda/fedavg.cfg.yaml
@@ -0,0 +1,5 @@
+# Individual configuration
+offload_stategy: vanilla
+deadline: 500
+single_machine: true
+real_time: false
diff --git a/experiments/example_docker/descr.yaml b/experiments/example_docker/descr.yaml
@@ -0,0 +1,48 @@
+---
+# Experiment configuration
+total_epochs: 3
+rounds: 5
+epochs_per_cycle: 1
+wait_for_clients: true
+net: MNISTCNN
+dataset: mnist
+# Use CUDA if available; setting to false will force CPU
+cuda: false
+profiling_time: 100
+warmup_round: false
+output_location: 'output/example_docker'
+tensor_board_active: true
+clients_per_round: 2
+node_groups:
+  slow: [1, 1]
+  medium: [2, 2]
+  fast: [3, 3]
+sampler: "uniform" # "limit labels" || "q sampler" || "dirichlet" || "uniform" (default)
+sampler_args:
+  - 0.07     # label limit || q probability || alpha || unused
+  - 42    # random seed || random seed || random seed || unused
+num_clients: 2
+replications: 2
+deploy:
+  docker:
+    base_path: deploy/docker
+    federator:
+      stub-name: system_stub.yml
+      pin-cores: true
+      num-cores: 1
+    clients:
+      fast:
+        stub-name: stub_default.yml
+        amount: 2
+        pin-cores: true
+        num-cores: 1
+        cpu-speed: 1
+        cpu-variation: 0
+      slow:
+        stub-name: stub_default.yml
+        amount: 0
+        pin-cores: true
+        num-cores: 1
+        cpu-speed: 1
+        cpu-variation: 0
+
diff --git a/experiments/example_docker/fedavg.cfg.yaml b/experiments/example_docker/fedavg.cfg.yaml
@@ -0,0 +1,5 @@
+# Individual configuration
+offload_stategy: vanilla
+deadline: 500
+single_machine: false
+real_time: true
diff --git a/experiments/example_native/descr.yaml b/experiments/example_native/descr.yaml
@@ -0,0 +1,25 @@
+---
+# Experiment configuration
+total_epochs: 3
+rounds: 5
+epochs_per_cycle: 1
+wait_for_clients: true
+net: MNISTCNN
+dataset: mnist
+# Use CUDA if available; setting to false will force CPU
+cuda: false
+profiling_time: 100
+warmup_round: false
+output_location: 'output/example_native'
+tensor_board_active: true
+clients_per_round: 2
+node_groups:
+  slow: [1, 1]
+  medium: [2, 2]
+  fast: [3, 3]
+sampler: "uniform" # "limit labels" || "q sampler" || "dirichlet" || "uniform" (default)
+sampler_args:
+  - 0.07     # label limit || q probability || alpha || unused
+  - 42    # random seed || random seed || random seed || unused
+num_clients: 10
+replications: 5
diff --git a/experiments/example_native/fedavg.cfg.yaml b/experiments/example_native/fedavg.cfg.yaml
@@ -0,0 +1,5 @@
+# Individual configuration
+offload_stategy: vanilla
+deadline: 500
+single_machine: true
+real_time: false
diff --git a/fltk/__init__.py b/fltk/__init__.py
@@ -1,2 +1,2 @@
 
-__version__ = '0.3.1'
+__version__ = '0.4.0'