diff --git a/.dockerignore b/.dockerignore index fa6a6105..0c60c0aa 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,9 +1,6 @@ -venv -default_models -data_loaders -data/cifar-10-batches-py -data/cifar-100-python.tar.gz -data/FashionMNIST -data/cifar-100-python -data/cifar-10-python.tar.gz -simple_example +# Ignoring the venv +venv/ +logging/ +# Ignoring all the compressed archives +**/*.tar.gz +**/__pycache__ diff --git a/.github/Bug_report.md b/.github/Bug_report.md new file mode 100644 index 00000000..73e6abb3 --- /dev/null +++ b/.github/Bug_report.md @@ -0,0 +1,44 @@ +--- +name: "\U0001F41B Bug Report" +about: "If something isn't working as expected \U0001F914." +title: '' +labels: 'i: bug, i: needs triage' +assignees: '' + +--- + +## Bug Report + +**Current Behavior** +A clear and concise description of the behavior. + +**Input Code** +- REPL or Repo link if applicable: + +```js +var your => (code) => here; +``` + +**Expected behavior/code** +A clear and concise description of what you expected to happen (or code). + +**FLTK Configuration (execution config, system parameters, hyper-parameters, etc.)** + +```js +{ + "your": { "config": "here" } +} +``` + +**Environment** +- Python version: [e.g. 3.7] +- PyTorch version: [e.g. 1.9.1] +- OS: [e.g. OSX 10.13.4, Windows 10] +- Kubernetes version: [e.g v1.22] +- Platform: [e.g. minikube, GKE, AWS] + +**Possible Solution** + + +**Additional context/Screenshots** +Add any other context about the problem here. If applicable, add screenshots to help explain. diff --git a/.github/Feature_request.md b/.github/Feature_request.md new file mode 100644 index 00000000..b9c82542 --- /dev/null +++ b/.github/Feature_request.md @@ -0,0 +1,23 @@ +--- +name: "\U0001F680 Feature Request" +about: "I have a suggestion (and may want to implement it \U0001F642)!" +title: '' +labels: 'i: enhancement, i: needs triage' +assignees: '' + +--- + +## Feature Request + +**Is your feature request related to a problem? Please describe.** +A clear and concise description of what the problem is. Ex. I have an issue when [...] + +**Describe the solution you'd like** +A clear and concise description of what you want to happen. Add any considered drawbacks. + +**Describe alternatives you've considered** +A clear and concise description of any alternative solutions or features you've considered. + +**Teachability, Documentation, Adoption, Migration Strategy** +If you can, explain how users will be able to use this and possibly write out a version the docs. +Maybe a screenshot or design? diff --git a/.github/Pull_request.md b/.github/Pull_request.md new file mode 100644 index 00000000..381618e0 --- /dev/null +++ b/.github/Pull_request.md @@ -0,0 +1,20 @@ +## (Short) Description +This pull request addresses issue ... + +## Types of changes + +- [ ] Bug fix (non-breaking change which fixes an issue) +- [ ] New feature (non-breaking change which adds functionality) +- [ ] Breaking change (fix or feature that would cause existing functionality to change) +- [ ] I have read the **CONTRIBUTING** document. +- [ ] My code follows the code style of this project. +- [ ] My change requires a change to the documentation. +- [ ] I have updated the documentation accordingly. +- [ ] I have added tests to cover my changes. +- [ ] All new and existing tests passed. 
+
+
+
+## (Optional) Additional remarks
+
+
diff --git a/.github/Regression.md b/.github/Regression.md
new file mode 100644
index 00000000..78af440d
--- /dev/null
+++ b/.github/Regression.md
@@ -0,0 +1,42 @@
+---
+name: "\U0001F4A5 v7 Regression"
+about: Report an unexpected behavior in v7 from v6
+title: ''
+labels: 'i: bug, 7.x: regression, i: needs triage'
+assignees: ''
+
+---
+
+# Regression
+
+**Potential Commit/PR that introduced the regression**
+If you have time to investigate, what PR/date introduced this issue.
+
+**Describe the regression**
+A clear and concise description of what the regression is.
+
+**Input Code**
+
+```js
+var your => (code) => here;
+```
+
+**FLTK Configuration (execution config, system parameters, hyper-parameters, etc.)**
+
+```js
+{
+  "your": { "config": "here" }
+}
+```
+
+**Expected behavior/code**
+A clear and concise description of what you expected to happen (or code).
+
+**Environment**
+- Python version: [e.g. 3.7]
+- PyTorch version: [e.g. 1.9.1]
+- OS: [e.g. OSX 10.13.4, Windows 10]
+- Kubernetes version: [e.g. v1.22]
+- Platform: [e.g. minikube, GKE, AWS]
diff --git a/.github/Support_question.md b/.github/Support_question.md
new file mode 100644
index 00000000..364bb346
--- /dev/null
+++ b/.github/Support_question.md
@@ -0,0 +1,14 @@
+---
+name: "\U0001F917 Support Question"
+about: "If you have a question \U0001F4AC, please check out our Mattermost or Brightspace page!"
+title: ''
+labels: 'i: question, i: needs triage'
+assignees: ''
+
+---
+
+--------------^ Click "Preview" for a nicer view!
+We primarily use GitHub as an issue tracker; for usage and support questions, feel free to open an issue!
+
+---
+
diff --git a/.gitignore b/.gitignore
index 106b5275..c467fd2a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -28,7 +28,7 @@ share/python-wheels/
 MANIFEST
 
 # PyInstaller
-# Usually these files are written by a python script from a template
+# Usually these files are written by a python script from a master_template
 # before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
@@ -128,15 +128,11 @@ dmypy.json
 
 # Pyre type checker
 .pyre/
-
 venv
 venv-*
-default_models
-data
-data_loaders
-simple_example
+data/**
+!data/.gitkeep
 output
-docker_data
 .idea
-*.tmp.txt
-docker-compose.yml
\ No newline at end of file
+
+logging/**/events.out.**
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
index b42cd68c..6d542913 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,41 +1,26 @@
-# Base image to start with
 FROM ubuntu:20.04
-
+
 # Who maintains this DockerFile
-MAINTAINER Bart Cox
+MAINTAINER Jeroen Galjaard
 
 # Run build without interactive dialogue
 ARG DEBIAN_FRONTEND=noninteractive
 
-ENV GLOO_SOCKET_IFNAME=eth0
-ENV TP_SOCKET_IFNAME=eth0
-
 # Define the working directory of the current Docker container
 WORKDIR /opt/federation-lab
 
-# Update the Ubuntu software repository
+# Update the Ubuntu software repository and fetch packages
 RUN apt-get update \
-  && apt-get install -y vim curl python3 python3-pip net-tools iproute2
-
-# Copy the current folder to the working directory
-COPY setup.py ./
-
-# Install all required packages for the generator
-RUN pip3 setup.py install
-
-#RUN mkdir -p ./data/MNIST
-#COPY ./data/MNIST ../data/MNIST
-ADD fltk ./fedsim
-#RUN ls -la
-COPY federated_learning.py ./
-COPY custom_mnist.py ./
-#RUN ls -la ./fedsim
+  && apt-get install -y curl python3 python3-pip net-tools iproute2
 
-# Expose the container's port to the host OS
-EXPOSE 5000
+# Add pre-downloaded datasets (otherwise they need to be downloaded on every run)
+ADD data/ data/
 
-# Run command by default for the executing container
-# CMD ["python3", "/opt/Generatrix/rpc_parameter_server.py", "--world_size=2", "--rank=0", "--master_addr=192.168.144.2"]
+# Use the pip cache, otherwise we repeatedly pull packages from the repository
+ADD requirements.txt ./
+RUN --mount=type=cache,target=/root/.cache/pip python3 -m pip install -r requirements.txt
 
-#CMD python3 /opt/federation-lab/rpc_parameter_server.py --world_size=$WORLD_SIZE --rank=$RANK --master_addr=10.5.0.11
-CMD python3 /opt/federation-lab/federated_learning.py $RANK $WORLD_SIZE 10.5.0.11
\ No newline at end of file
+# Add FLTK and configurations
+ADD fltk fltk
+ADD configs configs
+ADD charts charts
diff --git a/README.md b/README.md
index cf9a287a..810621d1 100644
--- a/README.md
+++ b/README.md
@@ -1,65 +1,85 @@
-# FLTK - Federation Learning Toolkit
+# Kubernetes - Federation Learning Toolkit ((K)FLTK)
 [![License](https://img.shields.io/badge/license-BSD-blue.svg)](LICENSE)
 [![Python 3.7](https://img.shields.io/badge/python-3.7-blue.svg)](https://www.python.org/downloads/release/python-370/)
 [![Python 3.8](https://img.shields.io/badge/python-3.8-blue.svg)](https://www.python.org/downloads/release/python-380/)
+[![Python 3.9](https://img.shields.io/badge/python-3.9-blue.svg)](https://www.python.org/downloads/release/python-390/)
 
-This toolkit is can be used to run Federated Learning experiments.
-Pytorch Distributed ([docs](https://pytorch.org/tutorials/beginner/dist_overview.html)) is used in this project.
-The goal if this project is to launch Federated Learning nodes in truly distribution fashion.
+This toolkit can be used to run Distributed and Federated Learning experiments. This project makes use of
+Pytorch Distributed (Data Parallel) ([docs](https://pytorch.org/tutorials/beginner/dist_overview.html))
+as well as Kubernetes, KubeFlow (Pytorch-Operator) ([docs](https://www.kubeflow.org/docs/)) and Helm ([docs](https://helm.sh/docs)) for deployment.
+The goal of this project is to launch Federated Learning nodes in a truly distributed fashion, with simple deployments
+using proven technology.
 
-This project is tested with Ubuntu 20.04 and python {3.7, 3.8}.
-### Global idea
-Pytorch distributed works based on a world_size and ranks. The ranks should be between 0 and world_size-1.
-Generally, the federator has rank 0 and the clients have ranks between 1 and world_size-1.
+This project builds on the work by Bart Cox, on the Federated Learning toolkit developed to run with Docker and
+Docker Compose ([repo](https://github.com/bacox/fltk)).
 
-General protocol:
-1. Client selection by the federator
-2. The selected clients download the model.
-2. Local training on the clients for X number of epochs
-3. Weights/gradients of the trained model are send to the federator
-4. Federator aggregates the weights/gradients to create a new and improved model
-5. Updated model is shared to the clients
-6. Repeat step 1 to 5 until convergence
+This project is tested with Ubuntu 20.04 and Arch Linux and Python {3.7, 3.8, 3.9}.
 
-Important notes:
+## Global idea
+Pytorch Distributed works based on a `world_size` and `rank`s. The ranks should be between `0` and `world_size-1`.
+Generally, the process leading the learning process has rank `0` and the clients have ranks `[1,..., world_size-1]`.
 
-* Data between clients is not shared to each other
-* The data is non-IID
+Currently, it is assumed that Distributed Learning is performed (and *not* Federated Learning); however, a future
+extension of the project is planned to implement a `FederatedClient` that allows for a more realistic simulation of
+*Federated* Learning experiments.
+
+**General protocol:**
+
+1. Client creation and spawning by the Orchestrator (using KubeFlow's Pytorch-Operator)
+2. Clients prepare the needed data and model and synchronize using PyTorch Distributed.
+   1. `WORLD_SIZE = 1`: the client performs training locally.
+   2. `WORLD_SIZE > 1`: clients run epochs with DistributedDataParallel together (a minimal sketch is shown below).
+   3. (FUTURE:) Your federated learning experiment.
+3. Clients log/report progress during and after training.
+
+**Important notes:**
+
+* Data between clients (`WORLD_SIZE > 1`) is not shared
 * Hardware can be heterogeneous
 * The location of devices matters (network latency and bandwidth)
 * Communication can be costly
 
+### Overview of deployed project
+When deploying the system, the following diagram shows how the system operates. `PyTorchJob`s are launched by the
+Orchestrator (see the [Orchestrator charts](./charts/orchestrator)). The Extractor keeps track of progress (see the
+[Extractor charts](./charts/extractor)).
+
+The `PyTorchJob`s can consist of a variable number of machines, with different hardware for the Master/Leader node and the
+Client nodes. KubeFlow (not depicted) orchestrates the deployment of the `PyTorchJob`s.
+
+![Overview of deployment](https://lucid.app/publicSegments/view/027793d8-a059-4c45-a030-660a492a4c0a/image.png)
+
+## Something is broken/missing
+
+It might be that something is missing or broken; if so, please open a pull request/issue.
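+
+To make the `WORLD_SIZE` distinction in the protocol above concrete, the sketch below shows the decision a client could
+make before training. This is an illustration only, not the toolkit's actual client implementation; it assumes that the
+`RANK`, `WORLD_SIZE`, `MASTER_ADDR` and `MASTER_PORT` environment variables are set, as KubeFlow's Pytorch-Operator does
+for the pods of a `PyTorchJob`.
+
+```python
+import os
+
+import torch
+import torch.distributed as dist
+from torch.nn.parallel import DistributedDataParallel
+
+
+def wrap_model(model: torch.nn.Module) -> torch.nn.Module:
+    """Return the model unchanged for a single client, or DDP-wrapped otherwise."""
+    world_size = int(os.environ.get('WORLD_SIZE', 1))
+    if world_size == 1:
+        # WORLD_SIZE = 1: the client trains locally; no synchronization is needed.
+        return model
+    # WORLD_SIZE > 1: join the process group; gradients are averaged across clients.
+    dist.init_process_group(backend='gloo',
+                            rank=int(os.environ['RANK']),
+                            world_size=world_size)
+    return DistributedDataParallel(model)
+```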
+
+## Project structure
+Structure with important folders and files explained:
+
+```
+project
-├── configs
-│    └── experiment.yaml           # Example of an experiment configuration
-├── deploy                         # Templates for automatic deployment
-│    └── templates
-│         ├── client_stub_default.yml
-│         ├── client_stub_medium.yml
-│         ├── client_stub_slow.yml
-│         └── system_stub.yml      # Describes the federator and the network
-├── fltk                           # Source code
-│    ├── datasets                  # Different dataset definitions
-│    │    ├── data_distribution    # Datasets with distributed sampler
-│    │    └── distributed          # "regular" datasets for centralized use
-│    ├── nets                      # Available networks
-│    ├── schedulers                # Learning Rate Schedulers
-│    ├── strategy                  # Client selection and model aggregation algorithms
-│    └── util
-│         └── generate_docker_compose.py # Generates a docker-compose.yml for a containerized run
-├── Dockerfile                     # Dockerfile to run in containers
-├── LICENSE
-├── README.md
-└── setup.py
+├── charts                # Templates for deploying projects with Helm
+│    ├── extractor        - Template for 'extractor' for centralized logging (using NFS)
+│    └── orchestrator     - Template for 'orchestrator' for launching distributed experiments
+├── configs               # General configuration files
+│    ├── quantities       - Files for (Kubernetes) quantity conversion
+│    └── tasks            - Files for experiment description
+├── data                  # Directory for default datasets (for a reduced load on hosting providers)
+├── fltk                  # Source code
+│    ├── datasets         - Datasets (by default provide Pytorch's Distributed Data-Parallel)
+│    ├── nets             - Default models
+│    ├── schedulers       - Learning rate schedulers
+│    ├── strategy         - (Future) Basic strategies for Federated Learning experiments
+│    └── util             - Helper utilities
+│         ├── cluster     * Cluster interaction (job creation and monitoring)
+│         ├── config      * Configuration file loading
+│         └── task        * Arrival/TrainTask generation
+└── logging               # Default logging location
+```
+
+## Models
+
-* Cifar10-CNN
+* Cifar10-CNN (CIFAR10CNN)
 * Cifar10-ResNet
 * Cifar100-ResNet
 * Cifar100-VGG
@@ -69,75 +89,326 @@ project
 
 ## Datasets
 
-* Cifar10
+* CIFAR10
 * Cifar100
 * Fashion-MNIST
+* MNIST
+
+## Pre-requisites
+
+The following tools need to be set up in your development environment before working with the (Kubernetes) FLTK.
+
+* Hard requirements
+  * Docker ([docs](https://www.docker.com/get-started)) (with support for BuildKit, [docs](https://docs.docker.com/develop/develop-images/build_enhancements/))
+  * Kubectl ([docs](https://kubernetes.io/docs/setup/))
+  * Helm ([docs](https://helm.sh/docs/chart_template_guide/getting_started/))
+  * Kustomize (3.2.0) ([docs](https://kubectl.docs.kubernetes.io/installation/kustomize/))
+* Local execution (single machine):
+  * MiniKube ([docs](https://minikube.sigs.k8s.io/docs/start/))
+  * It must be noted that certain functionality might require additional steps to work on MiniKube. This is currently untested.
+* Google Cloud Environment (GKE) execution:
+  * GCloud SDK ([docs](https://cloud.google.com/sdk/docs/quickstart))
+* Your own cluster provider:
+  * A Kubernetes cluster supporting Kubernetes 1.16+.
 
-## Prerequisites
+## Getting started
 
-When running in docker containers the following dependencies need to be installed:
-
-* Docker
-* Docker-compose
+Before continuing with a deployment, the datasets that are used first need to be downloaded. This is done to prevent
+every container from having to download its own copy of each dataset. By default, these datasets are included in the
+Docker container that gets deployed on a Kubernetes Cluster.
+
+### Download datasets
+To download the datasets, execute the following command from the [project root](.):
-## Install
 
 ```bash
-python3 setup.py install
+python3 -m fltk extractor ./configs/example_cloud_experiment.json
+```
+
+## Deployment
+
+This guide describes the general process of deploying an example experiment on the created cluster. It is assumed that
+you have already set up a cluster (or an emulation tool like MiniKube to execute the commands locally).
+
+**N.B.** This setup expects the NodePool on which you want to run the training experiments to have **Taints** set on
+the selected nodes. For more information on GKE see the [docs](https://cloud.google.com/kubernetes-engine/docs/how-to/node-taints).
+
+In this project we assume the following taint to be set; this can also be done per node using
+`kubectl taint nodes <node-name> fltk.node=normal:NoSchedule`.
+
+```
+fltk.node=normal:NoSchedule
+```
+
+Programmatically, the following `V1Toleration` allows pods to be scheduled on such 'tainted' nodes, regardless of the value
+for `fltk.node`.
+
+```python
+from kubernetes.client import V1Toleration
+
+V1Toleration(key="fltk.node",
+             operator="Exists",
+             effect="NoSchedule")
+```
+
+For a stricter Toleration (specific to a value), the following `V1Toleration` should be generated (note that the
+Kubernetes API only accepts the operators `Exists` and `Equal`).
+
+```python
+V1Toleration(key="fltk.node",
+             operator="Equal",
+             value="normal",
+             effect="NoSchedule")
+```
-### Load models
+
+For more information on the programmatic creation of `PyTorchJobs` to spawn on a cluster, refer to the
+`DeploymentBuilder` found in [`./fltk/util/cluster/client.py`](./fltk/util/cluster/client.py) and the function
+`construct_job`.
+
+### GKE / MiniKube
+This guide has been tested to result in a working FLTK setup on both GKE and MiniKube.
+
+The guide is structured as follows:
+
+1. (Optional) Set up a Kubernetes Dashboard instance for monitoring
+2. Install KubeFlow's Pytorch-Operator (in a bare-minimum configuration).
+   * KubeFlow is used to create and manage training jobs for Pytorch. However, you can also
+     extend the work by making use of KubeFlow's TF-Operator, to make use of Tensorflow.
+3. (Optional) Deploy a KubeFlow PyTorch Job using an example project.
+4. Install an NFS server.
+   * To simplify FLTK's deployment, an NFS server is used to allow for the creation of `ReadWriteMany` volumes in Kubernetes.
+     These volumes are, for example, used to create a centralized logging point that allows for easy extraction of data
+     from the `Extractor` pod.
+5. Set up and install the `Extractor` pod.
+   * The `Extractor` pod is used to create the required volume claims, as well as to create a single access point to gain
+     insight into the training process. Currently, it spawns a pod that runs a `Tensorboard` instance, as a
+     `SummaryWriter` is used to record progress in `Tensorboard` format. Logs are written to a `ReadWriteMany` volume
+     mounted on a pod's `$WORKING_DIR/logging` by default during execution.
+6. Deploy a default FLTK experiment.
+
+### (Optional) setup Kubernetes Dashboard
+Kubernetes Dashboard provides a comprehensive interface into metrics, logs and status information of your cluster
+and the deployments it's running.
+To set up this dashboard, Helm can be used as follows:
+
+```bash
+helm repo add kubernetes-dashboard https://kubernetes.github.io/dashboard/
+helm install kubernetes-dashboard kubernetes-dashboard/kubernetes-dashboard
+```
+
+After the setup completes, run the following commands to connect to your Kubernetes Dashboard (in case you changed the
+release name to something different, you can fetch the commands using `helm status your-release-name --namespace optional-namespace-name`):
+
+```bash
+export POD_NAME=$(kubectl get pods -n default -l "app.kubernetes.io/name=kubernetes-dashboard,app.kubernetes.io/instance=kubernetes-dashboard" -o jsonpath="{.items[0].metadata.name}")
+kubectl -n default port-forward $POD_NAME 8443:8443
+```
+
+Browsing to [https://localhost:8443](https://localhost:8443) on your machine will then connect you to the Dashboard instance.
+Note that the Kubernetes Dashboard's certificate is self-signed, so your browser may warn you that the site is unsafe.
+
+### Installing KubeFlow + PyTorch-Operator
+Kubeflow is an ML toolkit that allows for a wide range of distributed machine and deep learning operations on Kubernetes clusters.
+FLTK makes use of the 1.3 release. We will deploy a minimal configuration, following the documentation of KubeFlow's
+[manifests repository](https://github.com/kubeflow/manifests). If you have already set up KubeFlow (and the PyTorch-Operator),
+you may want to skip this step.
+
 ```bash
-python3 fltk/util/default_models.py
+git clone https://github.com/kubeflow/manifests.git --branch=v1.3-branch
+cd manifests
 ```
-## Examples
-<details><summary>Show Examples</summary>
+
+You might want to read the `README.md` file for more information. Using Kustomize, we will install the default configuration
+files for each KubeFlow component that is needed for a minimal setup. If you have already worked with KubeFlow on GKE,
+you might want to follow the GKE deployment in the official KubeFlow documentation. This will, however, result in a slightly
+higher strain on your cluster, as more components will be installed.
-<p>

-### Single machine (Native)
-
-#### Launch single client
-Launch Federator
+#### Setup cert-manager
+
+```bash
+kustomize build common/cert-manager/cert-manager/base | kubectl apply -f -
+# Wait for the cert-manager deployment to become ready before executing the following command
+kustomize build common/cert-manager/kubeflow-issuer/base | kubectl apply -f -
+```
+
+#### Setup Istio
+
 ```bash
-python3 -m fltk single configs/experiment.yaml --rank=0
+kustomize build common/istio-1-9/istio-crds/base | kubectl apply -f -
+kustomize build common/istio-1-9/istio-namespace/base | kubectl apply -f -
+kustomize build common/istio-1-9/istio-install/base | kubectl apply -f -
 ```
-Launch Client
+
+Install Dex:
+
+```bash
+kustomize build common/dex/overlays/istio | kubectl apply -f -
+```
+
+Install the OIDC AuthService:
+
+```bash
+kustomize build common/oidc-authservice/base | kubectl apply -f -
+```
+
+#### Setup knative
+
+```bash
+kustomize build common/knative/knative-serving/base | kubectl apply -f -
+kustomize build common/istio-1-9/cluster-local-gateway/base | kubectl apply -f -
+```
+
+#### Setup KubeFlow
+
+```bash
+kustomize build common/kubeflow-namespace/base | kubectl apply -f -
+```
+
 ```bash
-python3 -m fltk single configs/experiment.yaml --rank=1
+kustomize build common/kubeflow-roles/base | kubectl apply -f -
+kustomize build common/istio-1-9/kubeflow-istio-resources/base | kubectl apply -f -
 ```
 
-#### Spawn FL system
+#### Setup PyTorch-Operator
+
 ```bash
-python3 -m fltk spawn configs/experiment.yaml
+kustomize build apps/pytorch-job/upstream/overlays/kubeflow | kubectl apply -f -
 ```
 
-### Two machines (Native)
-To start a cross-machine FL system you have to configure the network interface connected to your network.
-For example, if your machine is connected to the network via the wifi interface (for example with the name `wlo1`) this has to be configured as shown below:
+### (Optional) Testing KubeFlow deployment
+
+In case you want to test your KubeFlow deployment, an example training job can be run. For this, an example project of
+the pytorch-operator [repository](https://github.com/kubeflow/pytorch-operator/) can be used.
+
+```bash
+git clone https://github.com/kubeflow/pytorch-operator.git
+cd pytorch-operator/examples/mnist
+```
+
+Follow the `README.md` instructions, and make sure to *rename* the image name in `pytorch-operator/examples/mnist/v1/pytorch_job_mnist_gloo.yaml`
+(lines 33 and 35) to your project on GCE. Also comment out the `resources` descriptions in lines 20-22 and 36-38; otherwise
+the jobs require GPU support to run.
+
+Build and push the Docker container, and execute the following command to launch your first PyTorchJob on your cluster.
+
+```bash
+kubectl create -f ./v1/pytorch_job_mnist_gloo.yaml
+```
+
+### Create experiment Namespace
+Create the namespace in your cluster that will later be used to deploy experiments. This guide (and the default
+setup of the project) assumes that the namespace `test` is used. With your cluster credentials set up, create it as follows:
+
+```bash
+kubectl create namespace test
+```
+
+### Installing NFS
+During execution, `ReadWriteMany` persistent volumes are needed. This is because each training process's master
+pod uses a `SummaryWriter` to log the training progress. As such, multiple containers on potentially different nodes require
+read-write access to a single volume. One way to resolve this is to make use of Google Filestore (or
+equivalent on your service provider of choice).
+However, this will incur significant operating costs, as operation starts at 1 TiB (~200 USD per month). As such, we
+will deploy our own NFS server on the cluster.
+
+In case this does not meet your scalability requirements, you may want to set up a (sharded) CouchDB instance and use
+that as a data store. This is not covered in this guide.
+
+For FLTK, we make use of the `nfs-server-provisioner` Helm chart created by `kvaps`, which neatly wraps this functionality in an easy
+to deploy chart. Make sure to install the NFS server in the same *namespace* as where you want to run your experiments.
+
+Running the following commands will deploy an `nfs-server` instance (named `nfs-server`) with the default configuration.
+In addition, it creates a Persistent Volume of `20 Gi`, allowing for `20 Gi` of `ReadWriteMany` persistent volume claims.
+You may want to change this amount, depending on your needs. Other service providers, such as DigitalOcean, might require the
+`storageClass` to be set to `do-block-storage` instead of `default`.
+
+```bash
+helm repo add kvaps https://kvaps.github.io/charts
+helm repo update
+helm install nfs-server kvaps/nfs-server-provisioner --namespace test --set persistence.enabled=true,persistence.storageClass=standard,persistence.size=20Gi
+```
+
+To create a Persistent Volume Claim, use the same syntax as the description provided in
+[./charts/extractor/templates/fl-log-claim-persistentvolumeclaim.yaml](./charts/extractor/templates/fl-log-claim-persistentvolumeclaim.yaml),
+which creates a claim using the values provided in [./charts/fltk-values.yaml](./charts/fltk-values.yaml). A short
+programmatic sketch of such a claim is shown in the Extractor section below.
+
+**N.B.** If you wish to use a Volume as both **ReadWriteOnce** and **ReadOnlyMany**, GCE does **NOT** provide this functionality.
+You'll need to either create a **ReadWriteMany** Volume with read-only Claims, or ensure that the writer completes before
+the readers are spawned (thus allowing **ReadWriteOnce** during deployment). For more information,
+consult the Kubernetes and GKE documentation.
+
+### Creating and pushing Docker containers
+On your remote cluster, you need to have set up a Docker registry. For example, Google provides the Google Container Registry
+(GCR). In this example, we will make use of GCR to push our container to the project `test-bed-distml` under the tag `fltk`.
+
+This requires you to have enabled GCR in your GCE project beforehand. Make sure that your Docker installation supports
+Docker BuildKit, or remove the `DOCKER_BUILDKIT=1` part from the command before running (this might require additional changes
+in the Dockerfile).
+
+```bash
+DOCKER_BUILDKIT=1 docker build . --tag gcr.io/test-bed-distml/fltk
+docker push gcr.io/test-bed-distml/fltk
+```
+
+**N.B.** When running in Minikube, you can also set up a local registry. An example of how this can be quickly achieved
+can be found [in this Medium post by Shashank Srivastava](https://shashanksrivastava.medium.com/how-to-set-up-minikube-to-use-your-local-docker-registry-10a5b564883).
+
+### Setting up the Extractor
+
+This section only needs to be run once, as it will set up the TensorBoard service as well as create the Volumes needed
+for the deployment of the `Orchestrator`'s chart. It does, however, require you to have pushed the Docker container to a
+registry that can be accessed from your Cluster.
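+
+The `ReadWriteMany` claims discussed in the NFS section above (and created by this chart) can also be constructed
+programmatically. The sketch below mirrors the chart's
+[fl-log-claim template](./charts/extractor/templates/fl-log-claim-persistentvolumeclaim.yaml) using the `kubernetes`
+Python client; the name and size shown here are illustrative, the real values come from the chart's values file.
+
+```python
+from kubernetes.client import (V1ObjectMeta, V1PersistentVolumeClaim,
+                               V1PersistentVolumeClaimSpec, V1ResourceRequirements)
+
+claim = V1PersistentVolumeClaim(
+    metadata=V1ObjectMeta(name="example-log-claim"),  # illustrative name
+    spec=V1PersistentVolumeClaimSpec(
+        access_modes=["ReadWriteMany"],  # several pods may mount the volume at once
+        resources=V1ResourceRequirements(requests={"storage": "2Gi"}),  # illustrative size
+        storage_class_name="nfs"))  # provisioned by the NFS server installed above
+```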
+**N.B.** Removing the `Extractor` chart will result in the deletion of the Persistent Volumes once all Claims are
+released. This **will remove** the data that is stored on these volumes. **Make sure to copy** the contents of these
+directories to your local file system before uninstalling the `Extractor` Helm chart. The following commands deploy the
+`Extractor` Helm chart, under the name `extractor`, in the `test` namespace:
+
+```bash
+cd charts
+helm install extractor ./extractor -f fltk-values.yaml --namespace test
+```
+
+And wait for it to deploy (check with `helm ls --namespace test`).
+
+**N.B.** To download data from the `Extractor` node (which mounts the logging directory), the following `kubectl`
+command can be used. This will download the data in the logging directory to your file system. Note that downloading
+many small files is slow (as they will be compressed individually). The command assumes that the default name
+`fl-extractor` is used.
+
+```bash
+kubectl cp --namespace test fl-extractor:/opt/federation-lab/logging ./logging
+```
+
+### Launching an experiment
+We have now completed the setup of the project and can continue by running actual experiments. If no errors occur, this
+should work without issue. You may also skip this step and work on your code, but it might be good to test your deployment
+before running into trouble later.
+
+```bash
+cd charts
+helm install flearner ./orchestrator --namespace test -f fltk-values.yaml
+```
+
+This will spawn an `fl-server` Pod in the `test` Namespace, which will spawn the Pods (using `V1PyTorchJob`s) that
+run the experiments. It will currently make use of the [`configs/example_cloud_experiment.json`](./configs/example_cloud_experiment.json)
+default configuration, as described in the [values](./charts/orchestrator/values.yaml) file of the `Orchestrator`'s Helm chart.
+
+## Running tests
+In addition to the FLTK framework implementation, some tests are available to prevent regression of bugs. Currently, only
+a limited subset of FLTK's features is tested. All current tests are deterministic; flaky tests indicate that something is
+likely broken.
+
+### Prerequisites
+Set up a `development` virtual environment, using the [`requirements-dev.txt`](requirements-dev.txt) requirements file.
+This will install the same requirements as [`requirements.txt`](requirements.txt), with some additional packages needed to
+run the tests.
+
+```bash
+python3 -m venv venv-dev
+source venv-dev/bin/activate
+pip install -r requirements-dev.txt
+```
+
+### Executing tests
+Make sure to run in a shell with the `venv-dev` virtual environment enabled. With the environment enabled, run:
+
 ```bash
-os.environ['GLOO_SOCKET_IFNAME'] = 'wlo1'
-os.environ['TP_SOCKET_IFNAME'] = 'wlo1'
+python3 -m pytest -v
 ```
-Use `ifconfig` to find the name of the interface name on your machine.
 
-### Docker Compose
-1. Make sure docker and docker-compose are installed.
-2. Generate a `docker-compose.yml` file for your experiment. You can use the script `generate_docker_compose.py` for this.
-   From the root folder: ```python3 fltk/util/generate_docker_compose.py 4``` to generate a system with 4 clients.
-   Feel free to change/extend `generate_docker_compose.py` for your own need.
-   A `docker-compose.yml` file is created in the root folder.
-3. Run docker-compose to start the system:
-   ```bash
-   docker-compose up
-   ```
-### Google Cloud Platform
-See Manual on brightspace
+Which will collect and run all the tests in the repository, and show in `verbose` mode which tests passed.

-
-## Known issues
-
-* Currently, there is no GPU support docker containers (or docker compose)
-* First epoch only can be slow (6x - 8x slower)
\ No newline at end of file
+## Known issues / Limitations
+
+* Currently, there is no GPU support in the Docker containers.
diff --git a/charts/Chart.yaml b/charts/Chart.yaml
new file mode 100644
index 00000000..dbf9f93c
--- /dev/null
+++ b/charts/Chart.yaml
@@ -0,0 +1,9 @@
+name: fltk
+description: Helm charts for the FLTK framework on Kubernetes.
+version: 0.0.1
+apiVersion: v1
+appVersion: 1.16.0
+keywords:
+  - fltk
+sources:
+home:
diff --git a/charts/README.md b/charts/README.md
new file mode 100644
index 00000000..86452ead
--- /dev/null
+++ b/charts/README.md
@@ -0,0 +1,2 @@
+# FLTK Helm charts
+
diff --git a/charts/extractor/Chart.yaml b/charts/extractor/Chart.yaml
new file mode 100644
index 00000000..3e64b092
--- /dev/null
+++ b/charts/extractor/Chart.yaml
@@ -0,0 +1,10 @@
+name: fltk-extractor
+description: Helm Chart for running the Extractor for the FLTK framework
+version: 0.1.0
+apiVersion: v1
+appVersion: 1.17.0
+keywords:
+  - extractor
+  - FLTK
+sources:
+home:
diff --git a/charts/extractor/templates/NOTES.txt b/charts/extractor/templates/NOTES.txt
new file mode 100644
index 00000000..55f6b4d8
--- /dev/null
+++ b/charts/extractor/templates/NOTES.txt
@@ -0,0 +1,5 @@
+Get the FLTK extractor's Tensorboard URL by running:
+
+export POD_NAME=$(kubectl get pods -n {{ .Release.Namespace }} -l "app.kubernetes.io/name=fltk.extractor" -o jsonpath="{.items[0].metadata.name}")
+echo http://127.0.0.1:6006/
+kubectl -n {{ .Release.Namespace }} port-forward $POD_NAME 6006:6006
diff --git a/charts/extractor/templates/fl-extractor-pod.yaml b/charts/extractor/templates/fl-extractor-pod.yaml
new file mode 100644
index 00000000..21bc5a4d
--- /dev/null
+++ b/charts/extractor/templates/fl-extractor-pod.yaml
@@ -0,0 +1,54 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: fl-extractor
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: "fltk.extractor"
+      app.kubernetes.io/instance: {{ .Release.Name }}
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: "fltk.extractor"
+        app.kubernetes.io/instance: {{ .Release.Name }}
+        app.kubernetes.io/managed-by: {{ .Release.Service }}
+        fltk.service: fl-extractor
+    spec:
+      containers:
+        - name: federation-lab-server
+          command:
+            - tensorboard
+          args:
+            - --logdir
+            - logging
+          image: {{ .Values.provider.domain }}/{{ .Values.provider.projectName }}/{{ .Values.provider.imageName }}
+          ports:
+            - containerPort: 6006
+          resources: {}
+          # Set the following values in case you want to limit the reserved resources.
+          # Whenever the pod exceeds these requirements, the Scheduler (Kubernetes) may kill
+          # the pod, resulting in a restart.
+          # requests: {}
+          # limits: {}
+          volumeMounts:
+            - mountPath: /opt/federation-lab/output
+              name: fl-server-claim
+              readOnly: true # We mount output readOnly as this is for Orchestrator data.
+            - mountPath: /opt/federation-lab/logging
+              name: fl-log-claim
+              readOnly: true # We mount log readOnly as this is for the Master worker nodes.
+          lifecycle:
+            # This pre-stop hook stops Tensorboard before the pod is terminated.
+            preStop:
+              exec:
+                command:
+                  - "killall"
+                  - "-9"
+                  - "tensorboard"
+      volumes:
+        - name: fl-server-claim
+          persistentVolumeClaim:
+            claimName: fl-server-claim
+        - name: fl-log-claim
+          persistentVolumeClaim:
+            claimName: fl-log-claim
diff --git a/charts/extractor/templates/fl-log-claim-persistentvolumeclaim.yaml b/charts/extractor/templates/fl-log-claim-persistentvolumeclaim.yaml
new file mode 100644
index 00000000..d663929a
--- /dev/null
+++ b/charts/extractor/templates/fl-log-claim-persistentvolumeclaim.yaml
@@ -0,0 +1,14 @@
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  labels:
+    io.fltk.service: fl-log-claim
+  name: fl-log-claim
+spec:
+  # Only the Extractor pod should write to the volume. Other pods 'simply' use it as read-only.
+  accessModes:
+    - ReadWriteMany
+  resources:
+    requests:
+      storage: {{ .Values.extractor.logging.size }}
+  storageClassName: "nfs"
diff --git a/charts/extractor/templates/fl-server-claim-persistentvolumeclaim.yaml b/charts/extractor/templates/fl-server-claim-persistentvolumeclaim.yaml
new file mode 100644
index 00000000..1a560181
--- /dev/null
+++ b/charts/extractor/templates/fl-server-claim-persistentvolumeclaim.yaml
@@ -0,0 +1,15 @@
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  labels:
+    io.fltk.service: fl-server-claim
+  name: fl-server-claim
+spec:
+  # Only the Orchestrator pod should write to the volume. The Extractor functions as a simple 'reader' to
+  # make data collection easier.
+  accessModes:
+    - ReadWriteMany
+  resources:
+    requests:
+      storage: {{ .Values.extractor.output.size }}
+  storageClassName: "nfs"
diff --git a/charts/extractor/values.yaml b/charts/extractor/values.yaml
new file mode 100644
index 00000000..a1c8c10e
--- /dev/null
+++ b/charts/extractor/values.yaml
@@ -0,0 +1,5 @@
+extractor:
+  logging:
+    size: 2Gi
+  output:
+    size: 5Gi
\ No newline at end of file
diff --git a/charts/fltk-values.yaml b/charts/fltk-values.yaml
new file mode 100644
index 00000000..70eb3908
--- /dev/null
+++ b/charts/fltk-values.yaml
@@ -0,0 +1,6 @@
+fltk:
+  config: cloud_experiment.yaml
+provider:
+  domain: gcr.io
+  projectName: test-bed-distml
+  imageName: fltk:latest
diff --git a/charts/orchestrator/Chart.yaml b/charts/orchestrator/Chart.yaml
new file mode 100644
index 00000000..3d40d0dc
--- /dev/null
+++ b/charts/orchestrator/Chart.yaml
@@ -0,0 +1,10 @@
+name: fltk-orchestrator
+description: Helm Chart for running the Federator/Orchestrator for the FLTK framework
+version: 0.1.0
+apiVersion: v1
+appVersion: 1.17.0
+keywords:
+  - Orchestrator
+  - FLTK
+sources:
+home:
diff --git a/charts/orchestrator/templates/fl-server-clusterrole-binding.yaml b/charts/orchestrator/templates/fl-server-clusterrole-binding.yaml
new file mode 100644
index 00000000..4aa4c7cb
--- /dev/null
+++ b/charts/orchestrator/templates/fl-server-clusterrole-binding.yaml
@@ -0,0 +1,12 @@
+kind: ClusterRoleBinding
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  name: fltk-administrator-binding
+subjects:
+  - kind: ServiceAccount
+    name: default
+    namespace: {{ .Release.Namespace }}
+roleRef:
+  kind: ClusterRole
+  name: fltk-administrator-role
+  apiGroup: rbac.authorization.k8s.io
diff --git a/charts/orchestrator/templates/fl-server-clusterrole.yaml b/charts/orchestrator/templates/fl-server-clusterrole.yaml
new file mode 100644
index 00000000..7066284f
--- /dev/null
+++ b/charts/orchestrator/templates/fl-server-clusterrole.yaml
@@ -0,0 +1,12 @@
+kind: ClusterRole
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+ 
name: fltk-administrator-role +# Currently, we allow every action with this ClusterRole, these could be restricted if deemed necessary +rules: + - apiGroups: + - '*' + resources: + - '*' + verbs: + - '*' diff --git a/charts/orchestrator/templates/fl-server-pod.yaml b/charts/orchestrator/templates/fl-server-pod.yaml new file mode 100644 index 00000000..30e00709 --- /dev/null +++ b/charts/orchestrator/templates/fl-server-pod.yaml @@ -0,0 +1,38 @@ +apiVersion: v1 +kind: Pod +metadata: + labels: + app.kubernetes.io/name: "fltk.orchestrator" + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/managed-by: {{ .Release.Service }} + fltk.service: fl-server + name: fl-server +spec: + containers: + - command: + - python3 + - -m + - fltk + args: + - cluster + - configs/{{ .Values.orchestrator.configurationFile }} + env: + - name: PYTHONUNBUFFERED + value: "1" + - name: IMAGE_NAME + value: {{ .Values.provider.domain }}/{{ .Values.provider.projectName }}/{{ .Values.provider.imageName }} + image: {{ .Values.provider.domain }}/{{ .Values.provider.projectName }}/{{ .Values.provider.imageName }} + name: federation-lab-server + resources: + limits: + cpu: {{ (.Values.orchestrator.cpu | int) }} + memory: {{ (.Values.orchestrator.memory | int) }} + volumeMounts: + - mountPath: /opt/federation-lab/output + name: fl-server-claim + readOnly: true + restartPolicy: Never + volumes: + - name: fl-server-claim + persistentVolumeClaim: + claimName: fl-server-claim diff --git a/charts/orchestrator/values.yaml b/charts/orchestrator/values.yaml new file mode 100644 index 00000000..ad94b4a1 --- /dev/null +++ b/charts/orchestrator/values.yaml @@ -0,0 +1,4 @@ +orchestrator: + cpu: 1000m + memory: 2000000000 + configurationFile: example_cloud_experiment.json \ No newline at end of file diff --git a/configs/example_cloud_experiment.json b/configs/example_cloud_experiment.json new file mode 100644 index 00000000..b6055ff6 --- /dev/null +++ b/configs/example_cloud_experiment.json @@ -0,0 +1,35 @@ +{ + "cluster": { + "orchestrator": { + "wait_for_clients": true, + "service": "fl-server.test.svc.cluster.local", + "nic": "eth0" + }, + "client": { + "prefix": "client", + "tensorboard_active": false + }, + "image": "gcr.io/test-bed-distml/fltk:latest" + }, + "execution_config": { + "duration": 3600, + "experiment_prefix": "cloud_experiment", + "cuda": false, + "tensorboard": { + "active": true, + "record_dir": "logging" + }, + "net": { + "save_model": false, + "save_temp_model": false, + "save_epoch_interval": 1, + "save_model_path": "models", + "epoch_save_start_suffix": "start", + "epoch_save_end_suffix": "end" + }, + "reproducibility": { + "torch_seed": 42, + "arrival_seed": 123 + } + } +} \ No newline at end of file diff --git a/configs/experiment.yaml b/configs/experiment.yaml deleted file mode 100644 index a59786bf..00000000 --- a/configs/experiment.yaml +++ /dev/null @@ -1,19 +0,0 @@ ---- -# Experiment configuration -total_epochs: 5 -epochs_per_cycle: 1 -wait_for_clients: true -net: Cifar10CNN -dataset: cifar10 -# Use cuda is available; setting to false will force CPU -cuda: true -experiment_prefix: 'experiment_sample' -output_location: 'output' -tensor_board_active: true -clients_per_round: 1 -system: - federator: - hostname: '131.180.40.72' - nic: 'wlo1' - clients: - amount: 1 diff --git a/configs/experiment_gcp_c20.yaml b/configs/experiment_gcp_c20.yaml deleted file mode 100644 index 193d61c9..00000000 --- a/configs/experiment_gcp_c20.yaml +++ /dev/null @@ -1,19 +0,0 @@ ---- -# Experiment configuration 
-total_epochs: 50 -epochs_per_cycle: 1 -wait_for_clients: true -net: FashionMNISTCNN -dataset: fashion-mnist -# Use cuda is available; setting to false will force CPU -cuda: true -experiment_prefix: 'ex-gcp-c20' -output_location: 'output' -tensor_board_active: true -clients_per_round: 10 -system: - federator: - hostname: '192.168.0.129' - nic: 'ens4' - clients: - amount: 20 diff --git a/configs/experiment_gcp_single.yaml b/configs/experiment_gcp_single.yaml deleted file mode 100644 index 19d1d03c..00000000 --- a/configs/experiment_gcp_single.yaml +++ /dev/null @@ -1,19 +0,0 @@ ---- -# Experiment configuration -total_epochs: 5 -epochs_per_cycle: 1 -wait_for_clients: true -net: Cifar10CNN -dataset: cifar10 -# Use cuda is available; setting to false will force CPU -cuda: true -experiment_prefix: 'experiment_single_machine' -output_location: 'output' -tensor_board_active: true -clients_per_round: 1 -system: - federator: - hostname: '131.180.40.72' - nic: 'ens4' - clients: - amount: 1 diff --git a/configs/non_iid_experiment.yaml b/configs/non_iid_experiment.yaml deleted file mode 100644 index 1bf59ed8..00000000 --- a/configs/non_iid_experiment.yaml +++ /dev/null @@ -1,23 +0,0 @@ ---- -# Experiment configuration -total_epochs: 30 -epochs_per_cycle: 1 -wait_for_clients: true -net: Cifar10CNN -dataset: cifar10 -# Use cuda is available; setting to false will force CPU -cuda: false -experiment_prefix: 'experiment_non_iid(dirichlet)' -output_location: 'output' -tensor_board_active: true -clients_per_round: 4 -sampler: "dirichlet" # "limit labels" || "q sampler" || "dirichlet" || "uniform" (default) -sampler_args: - - 0.07 # label limit || q probability || alpha || unused - - 42 # random seed || random seed || random seed || unused -system: - federator: - hostname: '192.168.1.108' - nic: 'wlp4s0' - clients: - amount: 10 # must be multiple of the number of labels for q-sampler and limit-labels diff --git a/configs/quantities/kubernetes.conf b/configs/quantities/kubernetes.conf new file mode 100644 index 00000000..cab9b9b3 --- /dev/null +++ b/configs/quantities/kubernetes.conf @@ -0,0 +1,16 @@ +kmemunits = 1 = [kmemunits] +Ki = 1024 * kmemunits +Mi = Ki^2 +Gi = Ki^3 +Ti = Ki^4 +Pi = Ki^5 +Ei = Ki^6 + +kcpuunits = 1 = [kcpuunits] +m = 1/1000 * kcpuunits +k = 1000 * kcpuunits +M = k^2 +G = k^3 +T = k^4 +P = k^5 +E = k^6 \ No newline at end of file diff --git a/configs/tasks/example_arrival_config.json b/configs/tasks/example_arrival_config.json new file mode 100644 index 00000000..2a75c84a --- /dev/null +++ b/configs/tasks/example_arrival_config.json @@ -0,0 +1,34 @@ +[ + { + "jobClassParameters": [ + { + "networkConfiguration": { + "network": "FashionMNISTCNN", + "dataset": "MNIST" + }, + "systemParameters": { + "dataParallelism": "1", + "executorCores": "750m", + "executorMemory": "1Gi", + "action": "train" + }, + "hyperParameters": { + "batchSize": "128", + "maxEpoch": "5", + "learningRate": "0.01", + "learningrateDecay": "0.0002" + }, + "classProbability": 0.1, + "priorities": [ + { + "priority": 1, + "probability": 1 + } + ] + } + ], + "lambda": 20, + "preemptJobs": 0 + } +] + diff --git a/data/.gitkeep b/data/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/deploy/templates/client_stub_default.yml b/deploy/templates/client_stub_default.yml deleted file mode 100644 index d65e2624..00000000 --- a/deploy/templates/client_stub_default.yml +++ /dev/null @@ -1,21 +0,0 @@ -client_name: # name can be anything -# container_name: federation-lab-client2 # what the name for this container 
would be - restart: "no" # if it crashes for example - build: . # look for the docker file where this file is currently located - volumes: - - ./docker_data:/opt/federation-lab/data - - ./default_models:/opt/federation-lab/default_models - - ./data_loaders:/opt/federation-lab/data_loaders - environment: - - PYTHONUNBUFFERED=1 - - RANK={rank} - - WORLD_SIZE={world_size} - ports: - - "5002:5000" # {machine-port}:{docker-port} - depends_on: - - "fl_server" - deploy: - resources: - limits: - cpus: '1.25' - memory: 1024M diff --git a/deploy/templates/client_stub_medium.yml b/deploy/templates/client_stub_medium.yml deleted file mode 100644 index 7083a3b6..00000000 --- a/deploy/templates/client_stub_medium.yml +++ /dev/null @@ -1,21 +0,0 @@ -client_name: # name can be anything -# container_name: federation-lab-client2 # what the name for this container would be - restart: "no" # if it crashes for example - build: . # look for the docker file where this file is currently located - volumes: - - ./docker_data:/opt/federation-lab/data - - ./default_models:/opt/federation-lab/default_models - - ./data_loaders:/opt/federation-lab/data_loaders - environment: - - PYTHONUNBUFFERED=1 - - RANK={rank} - - WORLD_SIZE={world_size} - ports: - - "5002:5000" # {machine-port}:{docker-port} - depends_on: - - "fl_server" - deploy: - resources: - limits: - cpus: '0.75' - memory: 1024M diff --git a/deploy/templates/client_stub_slow.yml b/deploy/templates/client_stub_slow.yml deleted file mode 100644 index 03a3fe48..00000000 --- a/deploy/templates/client_stub_slow.yml +++ /dev/null @@ -1,21 +0,0 @@ -client_name: # name can be anything -# container_name: federation-lab-client2 # what the name for this container would be - restart: "no" # if it crashes for example - build: . # look for the docker file where this file is currently located - volumes: - - ./docker_data:/opt/federation-lab/data - - ./default_models:/opt/federation-lab/default_models - - ./data_loaders:/opt/federation-lab/data_loaders - environment: - - PYTHONUNBUFFERED=1 - - RANK={rank} - - WORLD_SIZE={world_size} - ports: - - "5002:5000" # {machine-port}:{docker-port} - depends_on: - - "fl_server" - deploy: - resources: - limits: - cpus: '0.5' - memory: 1024M diff --git a/deploy/templates/system_stub.yml b/deploy/templates/system_stub.yml deleted file mode 100644 index eda5fc6d..00000000 --- a/deploy/templates/system_stub.yml +++ /dev/null @@ -1,23 +0,0 @@ -# creating a multi-container docker -version: "3.3" -services: - fl_server: # name can be anything - container_name: federation-lab-server # what the name for this container would be - restart: "no" # if it crashes for example - build: . # look for the docker file where this file is currently located - volumes: -# - ./data/MNIST:/opt/federation-lab/data/MNIST - - ./output:/opt/federation-lab/output - environment: - - PYTHONUNBUFFERED=1 - - RANK=0 - - WORLD_SIZE={world_size} - ports: - - "5000:5000" # {machine-port}:{docker-port} - networks: - default: - ipv4_address: 10.5.0.11 -networks: - default: - external: - name: local_network_dev \ No newline at end of file diff --git a/docs/google-cloud-platform.MD b/docs/google-cloud-platform.MD deleted file mode 100644 index e1ba01b9..00000000 --- a/docs/google-cloud-platform.MD +++ /dev/null @@ -1,71 +0,0 @@ -# Experiment on Google Cloud Platform - -The commands in this manual make use of the `gcloud` cli tool. -Make sure that this is installed on your machine. -The commands are tested on Ubuntu 20.04, but it should work on most Unix systems and on Windows WSL. 
- -The console can be found at [https://console.cloud.google.com/](https://console.cloud.google.com/) - -* Enable the Compute Engine in the console. - -## One Client example -This example creates one federator and one client instance. - -Start by creating two VM instances on the `Compute Engine`. - -1. Create the Federator VM instance with the following properties: - -| Property | Value | -| ------------- |:-------------:| -| Name | `federator` | -| Region | `europe-west4` | -| Machine Family | General Compute | -| Machine series | `N1` | -| Machine Type | `g1-small` -| Boot disk | Ubuntu 20.04 | -| Boot disk type | Balanced Persistent Disk | -| Disk Size| 10 GB | -| Firewall | `Allow HTTP traffic` and `Allow HTTPS traffic ` | -| Networking External ip | Ephemeral | - -2. Create the Client VM instance with the following properties: - -| Property | Value | -| ------------- |:-------------:| -| Name | `client1` | -| Region | `europe-west4` | -| Machine Family | General Compute | -| Machine series | `N1` | -| Machine Type | `g1-small` -| Boot disk | Ubuntu 20.04 | -| Boot disk type | Balanced Persistent Disk | -| Disk Size| 10 GB | -| Firewall | `Allow HTTP traffic` and `Allow HTTPS traffic ` | -| Networking External ip | Ephemeral | - -3. Once the VMs have started, SSH into the VMs and follow the next steps: - - * `sudo apt update` - * `git clone https://github.com/bacox/fltk.git fltk` - * `cd fltk` - * `python3 setup.py install` - -Make sure you have done this at both the `federator` and the `client1` VM instance. - -4. Get the ip address of the `federator` instance: - - * SSH into the `federator` instance - * Execute `ip a` in the shell - * The ip address is probally on NIC `ens4` - -5. On the `federator` instance (via SSH) start the system by executing from the `fltk` folder: - ```bash - python3 -m fltk single configs/experiment.yaml --rank=0 -–nic= -–host= - ``` - -6. 
On the `client1` instance (via SSH) start the system by executing from the `fltk` folder: - ```bash - python3 -m fltk single configs/experiment.yaml --rank=1 -–nic= -–host= - ``` - - \ No newline at end of file diff --git a/examples/change_world_size_gcp.py b/examples/change_world_size_gcp.py deleted file mode 100644 index 8889b19c..00000000 --- a/examples/change_world_size_gcp.py +++ /dev/null @@ -1,113 +0,0 @@ -import argparse -import os -import time -import googleapiclient.discovery -from googleapiclient.errors import HttpError - -def update_startup_script(compute, project, zone, name_template, rank, world_size, host, nic, region): - instance_name = name_template.format(rank=rank) - startup_script = open( - os.path.join( - os.path.dirname(__file__), 'startup-script_template.sh'), 'r').read() - startup_args = { - 'rank_arg': rank, - 'world_size_arg': world_size, - 'host_arg': host, - 'nic_arg': nic - } - instanceget = compute.instances().get(project=project, zone=zone, instance=instance_name).execute() - - fingerprint = instanceget['metadata']['fingerprint'] - instance_id = instanceget['id'] - # Insert values for startup script in template - startup_script = startup_script.format(**startup_args) - client_body = { - "fingerprint": fingerprint, - "items": [ - { - "key": "startup-script", - "value": startup_script - } - ] - } - print(f'Changing startup script of instance {instance_name}') - return compute.instances().setMetadata( - project=project, - zone=zone, - instance=instance_id, - body=client_body).execute() - -# [START list_instances] -def list_instances(compute, project, zone): - result = compute.instances().list(project=project, zone=zone).execute() - - result2 = compute.machineImages().list(project=project).execute() - print(result2) - return result['items'] if 'items' in result else None -# [END list_instances] - -# [START wait_for_operation] -def wait_for_operation(compute, project, zone, operation): - print('Waiting for operation to finish...') - while True: - result = compute.zoneOperations().get( - project=project, - zone=zone, - operation=operation).execute() - - if result['status'] == 'DONE': - print("done.") - if 'error' in result: - raise Exception(result['error']) - return result - - time.sleep(1) -# [END wait_for_operation] - - - -if __name__ == "__main__": - - parser = argparse.ArgumentParser(description='Change the world-size of VMs in GCP') - parser.add_argument('--num_clients', type=int, default=20, help='The number of clients (excluding the Federator) in the system') - parser.add_argument('--project', type=str, default='tud-federated-learning', help='The Google Cloud Platform project name') - args = parser.parse_args() - - num_clients = args.num_clients - project_name = args.project - - # Change these values if desired - region = 'europe-west4' - zone_name = f'{region}-a' - instance_name='tud-federated-learning-automated-instance' - name_template = 'tud-fl-client-{rank}' - name_template_federator = 'tud-fl-federator-{rank}' - - # The world size is number of clients + 1 - world_size = num_clients + 1 - nic = 'ens4' # Default nic in GCP ubuntu machines - - # Create GCP API instance - compute = googleapiclient.discovery.build('compute', 'beta') - instances = list_instances(compute, project_name, zone_name) - federator_ip = [x['networkInterfaces'][0]['networkIP'] for x in instances if x['name']==name_template_federator.format(rank=0)][0] - host = federator_ip - - ############################ - ## Alter Clients metadata ## - ############################ - operations = 
[] - for id in range(1, num_clients+1): - try: - operations.append(update_startup_script(compute, project_name, zone_name, name_template, id, world_size, host, nic, region)) - except HttpError as http_error: - if http_error.status_code == 409 and http_error.error_details[0]['reason'] == 'alreadyExists': - print('Resource already exists, continue with the next') - continue - else: - raise http_error - for operation in operations: - wait_for_operation(compute, project_name, zone_name, operation['name']) - - print("""The world-size of the clients are updated""") - diff --git a/examples/deploy.py b/examples/deploy.py deleted file mode 100644 index ae51ff1a..00000000 --- a/examples/deploy.py +++ /dev/null @@ -1,269 +0,0 @@ -import argparse -import os -import time -import googleapiclient.discovery -from googleapiclient.errors import HttpError - - -def create_federator(compute, project, zone, name_template, rank, region, machine_image): - machine_type = f'zones/{zone}/machineTypes/g1-small' - instance_name = name_template.format(rank=rank) - subnetwork = f'projects/{project}/regions/{region}/subnetworks/default' - - print(instance_name) - client_config = { - "kind": "compute#instance", - "name": instance_name, - "zone": zone, - "minCpuPlatform": "Automatic", - "machineType": machine_type, - "displayDevice": { - "enableDisplay": False - }, - "metadata": { - "kind": "compute#metadata", - "items": [], - }, - "tags": { - "items": [ - "http-server", - "https-server" - ] - }, - "canIpForward": False, - "networkInterfaces": [ - { - "kind": "compute#networkInterface", - "subnetwork": subnetwork, - "accessConfigs": [ - { - "kind": "compute#accessConfig", - "name": "External NAT", - "type": "ONE_TO_ONE_NAT", - "networkTier": "PREMIUM" - } - ], - "aliasIpRanges": [] - } - ], - "description": "", - "labels": { - "experiment": "ex-c20" - }, - "scheduling": { - "preemptible": False, - "onHostMaintenance": "MIGRATE", - "automaticRestart": True, - "nodeAffinities": [] - }, - "deletionProtection": False, - "reservationAffinity": { - "consumeReservationType": "ANY_RESERVATION" - }, - "serviceAccounts": [ - { - "email": "default", - "scopes": [ - "https://www.googleapis.com/auth/devstorage.read_only", - "https://www.googleapis.com/auth/logging.write", - "https://www.googleapis.com/auth/monitoring.write", - "https://www.googleapis.com/auth/servicecontrol", - "https://www.googleapis.com/auth/service.management.readonly", - "https://www.googleapis.com/auth/trace.append" - ] - } - ], - "sourceMachineImage": machine_image, - "shieldedInstanceConfig": { - "enableSecureBoot": False, - "enableVtpm": False, - "enableIntegrityMonitoring": True - }, - "confidentialInstanceConfig": { - "enableConfidentialCompute": False - } - } - return compute.instances().insert( - project=project, - zone=zone, - body=client_config).execute() - -def create_client(compute, project, zone, name_template, rank, world_size, host, nic, region, machine_image): - machine_type = f'zones/{zone}/machineTypes/g1-small' - instance_name = name_template.format(rank=rank) - subnetwork = f'projects/{project}/regions/{region}/subnetworks/default' - startup_script = open( - os.path.join( - os.path.dirname(__file__), 'startup-script_template.sh'), 'r').read() - startup_args = { - 'rank_arg': rank, - 'world_size_arg': world_size, - 'host_arg': host, - 'nic_arg': nic - } - startup_script = startup_script.format(**startup_args) - print(instance_name) - - client_config = { - "kind": "compute#instance", - "name": instance_name, - "zone": zone, - "minCpuPlatform": 
"Automatic", - "machineType": machine_type, - "displayDevice": { - "enableDisplay": False - }, - "metadata": { - "kind": "compute#metadata", - "items": [ - { - # Startup script is automatically executed by the - # instance upon startup. - 'key': 'startup-script', - 'value': startup_script - } - ], - }, - "tags": { - "items": [] - }, - "canIpForward": False, - "networkInterfaces": [ - { - "kind": "compute#networkInterface", - "subnetwork": subnetwork, - "aliasIpRanges": [] - } - ], - "description": "", - "labels": { - "experiment": "ex-c20" - }, - "scheduling": { - "preemptible": False, - "onHostMaintenance": "MIGRATE", - "automaticRestart": True, - "nodeAffinities": [] - }, - "deletionProtection": False, - "reservationAffinity": { - "consumeReservationType": "ANY_RESERVATION" - }, - "serviceAccounts": [ - { - "email": "default", - "scopes": [ - "https://www.googleapis.com/auth/devstorage.read_only", - "https://www.googleapis.com/auth/logging.write", - "https://www.googleapis.com/auth/monitoring.write", - "https://www.googleapis.com/auth/servicecontrol", - "https://www.googleapis.com/auth/service.management.readonly", - "https://www.googleapis.com/auth/trace.append" - ] - } - ], - "sourceMachineImage": machine_image, - "shieldedInstanceConfig": { - "enableSecureBoot": False, - "enableVtpm": False, - "enableIntegrityMonitoring": True - }, - "confidentialInstanceConfig": { - "enableConfidentialCompute": False - } - } - return compute.instances().insert( - project=project, - zone=zone, - body=client_config).execute() - -# [START list_instances] -def list_instances(compute, project, zone): - result = compute.instances().list(project=project, zone=zone).execute() - - result2 = compute.machineImages().list(project=project).execute() - print(result2) - return result['items'] if 'items' in result else None -# [END list_instances] - -# [START wait_for_operation] -def wait_for_operation(compute, project, zone, operation): - print('Waiting for operation to finish...') - while True: - result = compute.zoneOperations().get( - project=project, - zone=zone, - operation=operation).execute() - - if result['status'] == 'DONE': - print("done.") - if 'error' in result: - raise Exception(result['error']) - return result - - time.sleep(1) -# [END wait_for_operation] - - - -if __name__ == "__main__": - - parser = argparse.ArgumentParser(description='Create VMs in GCP for Federated Learning') - parser.add_argument('--num_clients', type=int, default=20, - help='The number of clients (excluding the Federator) in the system') - parser.add_argument('--project', type=str, default='tud-federated-learning', - help='The Google Cloud Platform project name') - parser.add_argument('--machine_image', type=str, default='c20-machine-image', - help='The Google Cloud Platform project name') - args = parser.parse_args() - - num_clients = args.num_clients - project_name = args.project - machine_image_name = args.machine_image - - # Change these values if desired - region = 'europe-west4' - zone_name = f'{region}-a' - instance_name='tud-federated-learning-automated-instance' - name_template = 'tud-fl-client-{rank}' - name_template_federator = 'tud-fl-federator-{rank}' - world_size = num_clients + 1 - nic = 'ens4' # Default nic in GCP ubuntu machines - machine_image = f'projects/{project_name}/global/machineImages/{machine_image_name}' - compute = googleapiclient.discovery.build('compute', 'beta') - - ###################### - ## Create Federator ## - ###################### - try: - federator_operation = create_federator(compute, 
project_name, zone_name, name_template_federator, 0, region, machine_image) - wait_for_operation(compute, project_name, zone_name, federator_operation['name']) - except HttpError as http_error: - if http_error.status_code == 409 and http_error.error_details[0]['reason'] == 'alreadyExists': - print('Resource already exists, continue with the next') - else: - raise http_error - - instances = list_instances(compute, project_name, zone_name) - federator_ip = [x['networkInterfaces'][0]['networkIP'] for x in instances if x['name']==name_template_federator.format(rank=0)][0] - host = federator_ip - - #################### - ## Create Clients ## - #################### - operations = [] - for id in range(1, num_clients+1): - try: - operations.append(create_client(compute, project_name, zone_name, name_template, id, world_size, host, nic, region, machine_image)) - wait_for_operation(compute, project_name, zone_name, operations[-1]['name']) - except HttpError as http_error: - if http_error.status_code == 409 and http_error.error_details[0]['reason'] == 'alreadyExists': - print('Resource already exists, continue with the next') - continue - else: - raise http_error - for operation in operations: - wait_for_operation(compute, project_name, zone_name, operation['name']) - - print("""Now login via ssh into the federator VM and start the experiment.""") - diff --git a/examples/startup-script_template.sh b/examples/startup-script_template.sh deleted file mode 100644 index 2a709325..00000000 --- a/examples/startup-script_template.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -python3 -m fltk remote --rank={rank_arg} --world_size={world_size_arg} --host={host_arg} --nic={nic_arg} \ No newline at end of file diff --git a/fltk/__main__.py b/fltk/__main__.py index ac8cf05c..c6332429 100644 --- a/fltk/__main__.py +++ b/fltk/__main__.py @@ -1,82 +1,72 @@ -import os -import sys -import torch.distributed.rpc as rpc +import json import logging +from argparse import Namespace, ArgumentParser +from pathlib import Path -import yaml -import argparse +from fltk.launch import launch_client, launch_orchestrator, launch_extractor +from fltk.util.config.arguments import create_client_parser, create_cluster_parser, extract_learning_parameters, \ + create_extractor_parser +from fltk.util.config.base_config import BareConfig -import torch.multiprocessing as mp -from fltk.federator import Federator -from fltk.launch import run_single, run_spawn -from fltk.util.base_config import BareConfig -logging.basicConfig(level=logging.DEBUG) +def __main__(): + parser = ArgumentParser(description='Experiment launcher for the Federated Learning Testbed') + subparsers = parser.add_subparsers(dest="mode") + create_client_parser(subparsers) + create_cluster_parser(subparsers) + create_extractor_parser(subparsers) + """ + To create your own parser mirror the construction in the 'client_parser' object. + Or refer to the ArgumentParser library documentation. + """ -def add_default_arguments(parser): - parser.add_argument('--world_size', type=str, default=None, - help='Number of entities in the world. 
This is the number of clients + 1') + arguments = parser.parse_args() -def main(): - parser = argparse.ArgumentParser(description='Experiment launcher for the Federated Learning Testbed') + with open(arguments.config, 'r') as config_file: + config: BareConfig = BareConfig.from_dict(json.load(config_file)) + config.config_path = Path(arguments.config) - subparsers = parser.add_subparsers(dest="mode") + if arguments.mode == 'cluster': + logging.info("Starting in cluster mode.") + cluster_start(arguments, config) + elif arguments.mode == 'client': + logging.info("Starting in client mode") + client_start(arguments, config) + logging.info("Stopping client...") + exit(0) + elif arguments.mode == 'extractor': + launch_extractor(arguments, config) + else: + print("Provided mode is not supported...") + exit(1) - single_parser = subparsers.add_parser('single') - single_parser.add_argument('config', type=str) - single_parser.add_argument('--rank', type=int) - single_parser.add_argument('--nic', type=str, default=None) - single_parser.add_argument('--host', type=str, default=None) - add_default_arguments(single_parser) - spawn_parser = subparsers.add_parser('spawn') - spawn_parser.add_argument('config', type=str) - add_default_arguments(spawn_parser) +def cluster_start(args: Namespace, configuration: BareConfig): + """ + Function to launch the Orchestrator for execution with the provided configurations. Currently + this assumes that a single Orchestrator is started that manages all the resources in the cluster. + """ + logging.basicConfig(level=logging.DEBUG, + datefmt='%m-%d %H:%M') + # Set the seed for arrivals; the torch seed is mostly ignored. Set the `arrival_seed` to a different value + # for each repetition that you want to run an experiment with. + configuration.set_seed() + launch_orchestrator(args=args, conf=configuration) - remote_parser = subparsers.add_parser('remote') - remote_parser.add_argument('--rank', type=int) - remote_parser.add_argument('--nic', type=str, default=None) - remote_parser.add_argument('--host', type=str, default=None) - add_default_arguments(remote_parser) - args = parser.parse_args() - if args.mode == 'remote': - if args.rank is None or args.host is None or args.world_size is None or args.nic is None: - print('Missing rank, host, world-size, or nic argument when in \'remote\' mode!') - parser.print_help() - exit(1) - world_size = int(args.world_size) - master_address = args.host - nic = args.nic - rank = int(args.rank) - if rank == 0: - print('Remote mode only supports ranks > 0!') - exit(1) - print(f'rank={args.rank}, world_size={world_size}, host={master_address}, args=None, nic={nic}') - run_single(rank=args.rank, world_size=world_size, host=master_address, args=None, nic=nic) - else: - with open(args.config) as file: - cfg = BareConfig() - yaml_data = yaml.load(file, Loader=yaml.FullLoader) - cfg.merge_yaml(yaml_data) - if args.mode == 'single': - if args.rank is None: - print('Missing rank argument when in \'single\' mode!') - parser.print_help() - exit(1) - world_size = args.world_size - master_address = args.host - nic = args.nic - if not world_size: - world_size = yaml_data['system']['clients']['amount'] + 1 - if not master_address: - master_address = yaml_data['system']['federator']['hostname'] - if not nic: - nic = yaml_data['system']['federator']['nic'] - print(f'rank={args.rank}, world_size={world_size}, host={master_address}, args=cfg, nic={nic}') - run_single(rank=args.rank, world_size=world_size, host=master_address, args=cfg, nic=nic) - else: - run_spawn(cfg)
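# A rough sketch of how the reworked entrypoint above is invoked; the JSON config
# path and the exact flag spellings are assumed from create_client_parser /
# create_cluster_parser and are illustrative only:
#
#   python3 -m fltk cluster configs/example_cloud_experiment.json
#   python3 -m fltk client configs/example_cloud_experiment.json --task_id <uid>
#   python3 -m fltk extractor configs/example_cloud_experiment.json
#
# 'cluster' launches the Orchestrator, 'client' launches a single learner for a
# training task, and 'extractor' pre-downloads the datasets (see fltk/extractor.py below).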
+def client_start(args: Namespace, configuration: BareConfig): + learning_params = extract_learning_parameters(args) + # Set the seed for PyTorch, numpy seed is mostly ignored. Set the `torch_seed` to a different value + # for each repetition that you want to run an experiment with. + configuration.set_seed() + task_id = args.task_id + launch_client(task_id, config=configuration, learning_params=learning_params, namespace=args) + if __name__ == "__main__": - main() \ No newline at end of file + root = logging.getLogger() + if root.handlers: + for handler in root.handlers: + root.removeHandler(handler) + logging.basicConfig(format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s', datefmt='%m-%d-%Y %H:%M:%S',) + __main__() diff --git a/fltk/client.py b/fltk/client.py index de3095e3..0a1f7c9d 100644 --- a/fltk/client.py +++ b/fltk/client.py @@ -1,307 +1,278 @@ -import copy import datetime -import os -import random -import time -from dataclasses import dataclass -from typing import List - -import torch -from torch.distributed import rpc import logging +from pathlib import Path +from typing import List, Tuple + import numpy as np +import torch +import torch.distributed as dist from sklearn.metrics import confusion_matrix -from sklearn.metrics import classification_report -from fltk.schedulers import MinCapableStepLR -from fltk.util.arguments import Arguments -from fltk.util.log import FLLogger - -import yaml +from torch.utils.tensorboard import SummaryWriter +from fltk.nets.util import calculate_class_precision, calculate_class_recall, save_model, load_model_from_file +from fltk.schedulers import MinCapableStepLR, LearningScheduler +from fltk.util.config.arguments import LearningParameters +from fltk.util.config.base_config import BareConfig from fltk.util.results import EpochData -logging.basicConfig(level=logging.DEBUG) - - - -def _call_method(method, rref, *args, **kwargs): - """helper for _remote_method()""" - return method(rref.local_value(), *args, **kwargs) - -def _remote_method(method, rref, *args, **kwargs): - """ - executes method(*args, **kwargs) on the from the machine that owns rref - - very similar to rref.remote().method(*args, **kwargs), but method() doesn't have to be in the remote scope - """ - args = [method, rref] + list(args) - return rpc.rpc_sync(rref.owner(), _call_method, args=args, kwargs=kwargs) - -def _remote_method_async(method, rref, *args, **kwargs): - args = [method, rref] + list(args) - return rpc.rpc_async(rref.owner(), _call_method, args=args, kwargs=kwargs) - -class Client: - counter = 0 - finished_init = False - dataset = None - epoch_results: List[EpochData] = [] - epoch_counter = 0 - - - def __init__(self, id, log_rref, rank, world_size, config = None): - logging.info(f'Welcome to client {id}') - self.id = id - self.log_rref = log_rref - self.rank = rank - self.world_size = world_size - # self.args = Arguments(logging) - self.args = config - self.args.init_logger(logging) - self.device = self.init_device() - self.set_net(self.load_default_model()) - self.loss_function = self.args.get_loss_function()() - self.optimizer = torch.optim.SGD(self.net.parameters(), - lr=self.args.get_learning_rate(), - momentum=self.args.get_momentum()) - self.scheduler = MinCapableStepLR(self.args.get_logger(), self.optimizer, - self.args.get_scheduler_step_size(), - self.args.get_scheduler_gamma(), - self.args.get_min_lr()) - - def init_device(self): - if self.args.cuda and torch.cuda.is_available(): - return torch.device("cuda:0") - else: - return torch.device("cpu") - - 
def ping(self): - return 'pong' - - def rpc_test(self): - sleep_time = random.randint(1, 5) - time.sleep(sleep_time) - self.local_log(f'sleep for {sleep_time} seconds') - self.counter += 1 - log_line = f'Number of times called: {self.counter}' - self.local_log(log_line) - self.remote_log(log_line) - - def remote_log(self, message): - _remote_method_async(FLLogger.log, self.log_rref, self.id, message, time.time()) - - def local_log(self, message): - logging.info(f'[{self.id}: {time.time()}]: {message}') - - def set_configuration(self, config: str): - yaml_config = yaml.safe_load(config) - - def init(self): - pass - - def init_dataloader(self, ): - self.args.distributed = True - self.args.rank = self.rank - self.args.world_size = self.world_size - # self.dataset = DistCIFAR10Dataset(self.args) - self.dataset = self.args.DistDatasets[self.args.dataset_name](self.args) - self.finished_init = True - logging.info('Done with init') - - def is_ready(self): - return self.finished_init - - def set_net(self, net): - self.net = net - self.net.to(self.device) - - def load_model_from_file(self, model_file_path): - model_class = self.args.get_net() - default_model_path = os.path.join(self.args.get_default_model_folder_path(), model_class.__name__ + ".model") - return self.load_model_from_file(default_model_path) - - def get_nn_parameters(self): - """ - Return the NN's parameters. - """ - return self.net.state_dict() - def load_default_model(self): - """ - Load a model from default model file. +class Client(object): - This is used to ensure consistent default model behavior. + def __init__(self, rank: int, task_id: str, world_size: int, config: BareConfig = None, + learning_params: LearningParameters = None): """ - model_class = self.args.get_net() - default_model_path = os.path.join(self.args.get_default_model_folder_path(), model_class.__name__ + ".model") - - return self.load_model_from_file(default_model_path) - - def load_model_from_file(self, model_file_path): + @param rank: PyTorch rank provided by KubeFlow setup. + @type rank: int + @param task_id: String id representing the UID of the training task + @type task_id: str + @param config: Parsed configuration file representation to extract runtime information from. + @type config: BareConfig + @param learning_params: Hyper-parameter configuration to be used during the training process by the learner. + @type learning_params: LearningParameters """ - Load a model from a file. + self._logger = logging.getLogger(f'Client-{rank}-{task_id}') - :param model_file_path: string - """ - model_class = self.args.get_net() - model = model_class() + self._logger.info("Initializing learning client") + self._id = rank + self._world_size = world_size + self._task_id = task_id - if os.path.exists(model_file_path): - try: - model.load_state_dict(torch.load(model_file_path)) - except: - self.args.get_logger().warning("Couldn't load model. 
Attempting to map CUDA tensors to CPU to solve error.") + self.config = config + self.learning_params = learning_params - model.load_state_dict(torch.load(model_file_path, map_location=torch.device('cpu'))) - else: - self.args.get_logger().warning("Could not find model: {}".format(model_file_path)) + # Create model and dataset + self.loss_function = self.learning_params.get_loss()() + self.dataset = self.learning_params.get_dataset_class()(self.config, self.learning_params, self._id, + self._world_size) + self.model = self.learning_params.get_model_class()() + self.device = self._init_device() - return model + self.optimizer: torch.optim.Optimizer + self.scheduler: LearningScheduler + self.tb_writer: SummaryWriter - def get_client_index(self): + def prepare_learner(self, distributed: bool = False) -> None: """ - Returns the client index. + Function to prepare the learner, i.e. load all the necessary data into memory. + @param distributed: Indicates whether the execution must be run in Distributed fashion with DDP. + @type distributed: bool + When executing in distributed fashion a backend is needed: for CPU execution the GLOO (default) backend + must be used, while for GPU execution the NCCL backend is needed. + @return: None + @rtype: None """ - return self.client_idx - - def update_nn_parameters(self, new_params): + self._logger.info(f"Preparing learner model with distributed={distributed}") + self.model.to(self.device) + if distributed: + self.model = torch.nn.parallel.DistributedDataParallel(self.model) + + # Currently an SGD optimizer is assumed; **kwargs would be needed to configure this properly. + self.optimizer = self.learning_params.get_optimizer()(self.model.parameters(), + lr=self.learning_params.learning_rate, + momentum=0.9) + self.scheduler = MinCapableStepLR(self.optimizer, + self.config.get_scheduler_step_size(), + self.config.get_scheduler_gamma(), + self.config.get_min_lr()) + + self.tb_writer = SummaryWriter( + str(self.config.get_log_path(self._task_id, self._id, self.learning_params.model))) + def stop_learner(self): """ - Update the NN's parameters. + @deprecated Function to stop a learner upon command of another learner. + @return: None + @rtype: None + """ + self._logger.info(f"Tearing down Client {self._id}") + self.tb_writer.close() - :param new_params: New weights for the neural network - :type new_params: dict + def _init_device(self, cuda_device: torch.device = torch.device(f'cpu')): + """ + Initialize Torch to use available devices. Either prepares the CUDA device, or disables CUDA during execution to + run with CPU-only inference/training. + @param cuda_device: Torch device to use, refers to the CUDA device to be used in case there are multiple. + Defaults to the first CUDA device (index 0) when CUDA is enabled. + @type cuda_device: torch.device + @return: None + @rtype: None """ - self.net.load_state_dict(copy.deepcopy(new_params), strict=True) - if self.log_rref: - self.remote_log(f'Weights of the model are updated') + if self.config.cuda_enabled() and torch.cuda.is_available(): + return torch.device(dist.get_rank()) + else: + # Force usage of CPU + torch.cuda.is_available = lambda: False + return cuda_device - def train(self, epoch): + def load_default_model(self): """ - :param epoch: Current epoch # - :type epoch: int + @deprecated Load a model from default model file. This function could be used to ensure consistent default model + behavior.
When using PyTorch's DistributedDataParallel, however, the first step will always synchronize the + model. """ - # self.net.train() - # save model - if self.args.should_save_model(epoch): - self.save_model(epoch, self.args.get_epoch_save_start_suffix()) + model_file = Path(f'{self.model.__name__}.model') + default_model_path = Path(self.config.get_default_model_folder_path()).joinpath(model_file) + load_model_from_file(self.model, default_model_path) + + def train(self, epoch, log_interval: int = 50): + """ + Function to start training, regardless of DistributedDataParallel (DDP) or local training. DDP will account for + synchronization of nodes. If an extension requires the use of torch.distributed.send and torch.distributed.recv + (for example for customized training or Federated Learning), additional torch.distributed.barrier calls might + be required. + :param epoch: Current epoch number + :type epoch: int + @param log_interval: Iteration interval at which to log. + @type log_interval: int + """ running_loss = 0.0 final_running_loss = 0.0 - if self.args.distributed: - self.dataset.train_sampler.set_epoch(epoch) - - for i, (inputs, labels) in enumerate(self.dataset.get_train_loader(), 0): - inputs, labels = inputs.to(self.device), labels.to(self.device) - + self.model.train() + for i, (inputs, labels) in enumerate(self.dataset.get_train_loader()): # zero the parameter gradients self.optimizer.zero_grad() - # forward + backward + optimize - outputs = self.net(inputs) - loss = self.loss_function(outputs, labels) + # Forward through the net to train + outputs = self.model(inputs.to(self.device)) + + # Calculate the loss + loss = self.loss_function(outputs, labels.to(self.device)) + + # Update weights; DDP will account for synchronization of the weights. loss.backward() self.optimizer.step() - # print statistics - running_loss += loss.item() - if i % self.args.get_log_interval() == 0: - self.args.get_logger().info('[%d, %5d] loss: %.3f' % (epoch, i, running_loss / self.args.get_log_interval())) - final_running_loss = running_loss / self.args.get_log_interval() + running_loss += float(loss.detach().item()) + if i % log_interval == 0: + self._logger.info('[%d, %5d] loss: %.3f' % (epoch, i, running_loss / log_interval)) + final_running_loss = running_loss / log_interval running_loss = 0.0 - self.scheduler.step() - # save model - if self.args.should_save_model(epoch): - self.save_model(epoch, self.args.get_epoch_save_end_suffix()) + # Save model + if self.config.should_save_model(epoch): + # Note that currently this is not supported in the Framework. However, the creation of a ReadWriteMany + # PVC in the deployment charts, and mounting this in the appropriate directory, would resolve this issue. + # This can be done by copying the setup of the PVC used to record the TensorBoard information (used by the + # logger created by the rank==0 node during the training process, i.e. to keep track of progress). + self.save_model(epoch) - return final_running_loss, self.get_nn_parameters() - - def test(self): - self.net.eval() + return final_running_loss + def test(self) -> Tuple[float, float, np.array, np.array, np.array]: + """ + Function to test the trained model using the test dataset. Returns a number of statistics of the testing + process. + @warning Currently the testing process assumes that the model performs classification; for different types of + tasks this function would need to be updated.
+ @return: (accuracy, loss, class_precision, class_recall, confusion_mat): class_precision, class_recall and + confusion_mat will each be a np.array, whose length corresponds to the number of classes in a classification task. + @rtype: Tuple[float, float, np.array, np.array, np.array]: + """ correct = 0 total = 0 targets_ = [] pred_ = [] loss = 0.0 + + # Disable gradient calculation, as we are only interested in predictions with torch.no_grad(): for (images, labels) in self.dataset.get_test_loader(): images, labels = images.to(self.device), labels.to(self.device) - outputs = self.net(images) + outputs = self.model(images) + # Currently the FLTK framework assumes that a classification task is performed (hence max). + # Future work may add support for non-classification training. _, predicted = torch.max(outputs.data, 1) total += labels.size(0) correct += (predicted == labels).sum().item() - targets_.extend(labels.cpu().view_as(predicted).numpy()) - pred_.extend(predicted.cpu().numpy()) + targets_.extend(labels.detach().cpu().view_as(predicted).numpy()) + pred_.extend(predicted.detach().cpu().numpy()) loss += self.loss_function(outputs, labels).item() - accuracy = 100 * correct / total - confusion_mat = confusion_matrix(targets_, pred_) + accuracy = 100.0 * correct / total + confusion_mat: np.array = confusion_matrix(targets_, pred_) - class_precision = self.calculate_class_precision(confusion_mat) - class_recall = self.calculate_class_recall(confusion_mat) + class_precision: np.array = calculate_class_precision(confusion_mat) + class_recall: np.array = calculate_class_recall(confusion_mat) - self.args.get_logger().debug('Test set: Accuracy: {}/{} ({:.0f}%)'.format(correct, total, accuracy)) - self.args.get_logger().debug('Test set: Loss: {}'.format(loss)) - self.args.get_logger().debug("Classification Report:\n" + classification_report(targets_, pred_)) - self.args.get_logger().debug("Confusion Matrix:\n" + str(confusion_mat)) - self.args.get_logger().debug("Class precision: {}".format(str(class_precision))) - self.args.get_logger().debug("Class recall: {}".format(str(class_recall))) + self._logger.debug('Test set: Accuracy: {}/{} ({:.0f}%)'.format(correct, total, accuracy)) + self._logger.debug('Test set: Loss: {}'.format(loss)) + self._logger.debug("Confusion Matrix:\n" + str(confusion_mat)) + self._logger.debug("Class precision: {}".format(str(class_precision))) + self._logger.debug("Class recall: {}".format(str(class_recall))) - return accuracy, loss, class_precision, class_recall + return accuracy, loss, class_precision, class_recall, confusion_mat - def run_epochs(self, num_epoch): - start_time_train = datetime.datetime.now() - self.dataset.get_train_sampler().set_epoch_size(num_epoch) - loss, weights = self.train(self.epoch_counter) - self.epoch_counter += num_epoch - elapsed_time_train = datetime.datetime.now() - start_time_train - train_time_ms = int(elapsed_time_train.total_seconds()*1000) - - start_time_test = datetime.datetime.now() - accuracy, test_loss, class_precision, class_recall = self.test() - elapsed_time_test = datetime.datetime.now() - start_time_test - test_time_ms = int(elapsed_time_test.total_seconds()*1000) - - data = EpochData(self.epoch_counter, train_time_ms, test_time_ms, loss, accuracy, test_loss, class_precision, class_recall, client_id=self.id) - self.epoch_results.append(data) - - # Copy GPU tensors to CPU - for k, v in weights.items(): - weights[k] = v.cpu() - return data, weights - - def save_model(self, epoch, suffix):
""" - Saves the model if necessary. + Function to run training epochs using the pre-set Hyper-Parameters. + @return: A list of data gathered during the execution, containing progress information such as accuracy. See also + EpochData. + @rtype: List[EpochData] """ - self.args.get_logger().debug("Saving model to flat file storage. Save #{}", epoch) - - if not os.path.exists(self.args.get_save_model_folder_path()): - os.mkdir(self.args.get_save_model_folder_path()) - - full_save_path = os.path.join(self.args.get_save_model_folder_path(), "model_" + str(self.client_idx) + "_" + str(epoch) + "_" + suffix + ".model") - torch.save(self.get_nn_parameters(), full_save_path) - - def calculate_class_precision(self, confusion_mat): + max_epoch = self.learning_params.max_epoch + 1 + start_time_train = datetime.datetime.now() + epoch_results = [] + for epoch in range(1, max_epoch): + train_loss = self.train(epoch) + + # Let only the 'master node' work on training. Possibly DDP can be used + # to have a distributed test loader as well to speed up (would require + # aggregation of data. + # Example https://github.com/fabio-deep/Distributed-Pytorch-Boilerplate/blob/0206247150720ca3e287e9531cb20ef68dc9a15f/src/datasets.py#L271-L303. + elapsed_time_train = datetime.datetime.now() - start_time_train + train_time_ms = int(elapsed_time_train.total_seconds() * 1000) + + start_time_test = datetime.datetime.now() + accuracy, test_loss, class_precision, class_recall, confusion_mat = self.test() + + elapsed_time_test = datetime.datetime.now() - start_time_test + test_time_ms = int(elapsed_time_test.total_seconds() * 1000) + + data = EpochData(epoch_id=epoch, + duration_train=train_time_ms, + duration_test=test_time_ms, + loss_train=train_loss, + accuracy=accuracy, + loss=test_loss, + class_precision=class_precision, + class_recall=class_recall, + confusion_mat=confusion_mat) + + epoch_results.append(data) + if self._id == 0: + self.log_progress(data, epoch) + return epoch_results + + def save_model(self, epoch): """ - Calculates the precision for each class from a confusion matrix. + @deprecated Move function to utils directory. """ - return np.diagonal(confusion_mat) / np.sum(confusion_mat, axis=0) + self._logger.debug(f"Saving model to flat file storage. Saved at epoch #{epoch}") + save_model(self.model, self.config.get_save_model_folder_path(), epoch) - def calculate_class_recall(self, confusion_mat): + def log_progress(self, epoch_data: EpochData, epoch: int): """ - Calculates the recall for each class from a confusion matrix. + Function to log the progress of the learner between epochs. Only the MASTER/RANK=0 process should call this + function. Other learners' SummaryWriters data will be gone after the pod reached 'Completed' status. + @param epoch_data: data object which needs to be logged with the learners SummaryWriter. + @type epoch_data: EpochData + @param epoch: Number of the epoch. 
+ @type epoch: int + @return: None + @rtype: None """ - return np.diagonal(confusion_mat) / np.sum(confusion_mat, axis=1) - def get_client_datasize(self): - return len(self.dataset.get_train_sampler()) + self.tb_writer.add_scalar('training loss per epoch', + epoch_data.loss_train, + epoch) - def __del__(self): - print(f'Client {self.id} is stopping') + self.tb_writer.add_scalar('accuracy per epoch', + epoch_data.accuracy, + epoch) diff --git a/fltk/datasets/__init__.py b/fltk/datasets/__init__.py index 12141278..38534a4d 100644 --- a/fltk/datasets/__init__.py +++ b/fltk/datasets/__init__.py @@ -1 +1,4 @@ -from .distributed import * +from .cifar10 import CIFAR10Dataset +from .cifar100 import CIFAR100Dataset +from .fashion_mnist import FashionMNISTDataset +from .mnist import MNIST \ No newline at end of file diff --git a/fltk/datasets/cifar10.py b/fltk/datasets/cifar10.py index 82e375e4..8b6d763a 100644 --- a/fltk/datasets/cifar10.py +++ b/fltk/datasets/cifar10.py @@ -1,17 +1,16 @@ -from .dataset import Dataset +from torch.utils.data import DataLoader, DistributedSampler from torchvision import datasets from torchvision import transforms -from torch.utils.data import DataLoader, DistributedSampler +from .dataset import Dataset -class CIFAR10Dataset(Dataset): - def __init__(self, args): - super(CIFAR10Dataset, self).__init__(args) +class CIFAR10Dataset(Dataset): - def load_train_dataset(self): - self.get_args().get_logger().debug("Loading CIFAR10 train data") + def __init__(self, config, learning_param, rank: int = 0, world_size: int = None): + super(CIFAR10Dataset, self).__init__(config, learning_param, rank, world_size) + def load_train_dataset(self, rank: int = 0, world_size: int = None): normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) transform = transforms.Compose([ transforms.RandomHorizontalFlip(), @@ -20,32 +19,22 @@ def load_train_dataset(self): normalize ]) - train_dataset = datasets.CIFAR10(root=self.get_args().get_data_path(), train=True, download=True, transform=transform) - sampler = DistributedSampler(train_dataset, rank=self.args.get_rank(), num_replicas=self.args.get_world_size()) if self.args.get_distributed() else None - train_loader = DataLoader(train_dataset, batch_size=len(train_dataset), sampler=sampler) - self.args.set_sampler(sampler) + train_dataset = datasets.CIFAR10(root=self.config.get_data_path(), train=True, download=True, transform=transform) + sampler = DistributedSampler(train_dataset, rank=rank, num_replicas=self.world_size) if self.world_size else None + train_loader = DataLoader(train_dataset, batch_size=self.learning_params.batch_size, sampler=sampler, + shuffle=(sampler is None)) - train_data = self.get_tuple_from_data_loader(train_loader) - - self.get_args().get_logger().debug("Finished loading CIFAR10 train data") - - return train_data + return train_loader def load_test_dataset(self): - self.get_args().get_logger().debug("Loading CIFAR10 test data") - normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) transform = transforms.Compose([ transforms.ToTensor(), normalize ]) - test_dataset = datasets.CIFAR10(root=self.get_args().get_data_path(), train=False, download=True, transform=transform) - sampler = DistributedSampler(test_dataset, rank=self.args.get_rank(), num_replicas=self.args.get_world_size()) if self.args.get_distributed() else None - test_loader = DataLoader(test_dataset, batch_size=len(test_dataset), sampler=sampler) - self.args.set_sampler(sampler) - - test_data = 
self.get_tuple_from_data_loader(test_loader) - - self.get_args().get_logger().debug("Finished loading CIFAR10 test data") - - return test_data + test_dataset = datasets.CIFAR10(root=self.config.get_data_path(), train=False, download=True, + transform=transform) + sampler = DistributedSampler(test_dataset, rank=self.rank, + num_replicas=self.world_size) if self.world_size else None + test_loader = DataLoader(test_dataset, batch_size=self.learning_params.batch_size, sampler=sampler) + return test_loader diff --git a/fltk/datasets/cifar100.py b/fltk/datasets/cifar100.py index 186a98dc..253c6040 100644 --- a/fltk/datasets/cifar100.py +++ b/fltk/datasets/cifar100.py @@ -1,45 +1,37 @@ -from .dataset import Dataset +from torch.utils.data import DataLoader, DistributedSampler from torchvision import datasets from torchvision import transforms -from torch.utils.data import DataLoader -class CIFAR100Dataset(Dataset): +from .dataset import Dataset - def __init__(self, args): - super(CIFAR100Dataset, self).__init__(args) - def load_train_dataset(self): - self.get_args().get_logger().debug("Loading CIFAR100 train data") +class CIFAR100Dataset(Dataset): - normalize = transforms.Normalize(mean=[0.507, 0.487, 0.441], std=[0.267, 0.256, 0.276]) - transform = transforms.Compose([ - transforms.RandomHorizontalFlip(), - transforms.RandomCrop(32, 4), - transforms.ToTensor(), - normalize - ]) - train_dataset = datasets.CIFAR100(root=self.get_args().get_data_path(), train=True, download=True, transform=transform) - train_loader = DataLoader(train_dataset, batch_size=len(train_dataset)) + DEFAULT_TRANSFORM = transforms.Compose([ + transforms.RandomHorizontalFlip(), + transforms.RandomCrop(32, 4), + transforms.ToTensor(), + transforms.Normalize(mean=[0.507, 0.487, 0.441], std=[0.267, 0.256, 0.276]) + ]) - train_data = self.get_tuple_from_data_loader(train_loader) + def __init__(self, config, learning_param, rank: int = 0, world_size: int = None): + super(CIFAR100Dataset, self).__init__(config, learning_param, rank, world_size) - self.get_args().get_logger().debug("Finished loading CIFAR100 train data") + def load_train_dataset(self, rank: int = 0, world_size: int = None): + train_dataset = datasets.CIFAR100(root=self.config.get_data_path(), train=True, download=True, + transform=self.DEFAULT_TRANSFORM) + sampler = DistributedSampler(train_dataset, rank=rank, + num_replicas=self.world_size) if self.world_size else None + train_loader = DataLoader(train_dataset, batch_size=self.learning_params.batch_size, sampler=sampler, + shuffle=(sampler is None)) - return train_data + return train_loader def load_test_dataset(self): - self.get_args().get_logger().debug("Loading CIFAR100 test data") - - normalize = transforms.Normalize(mean=[0.507, 0.487, 0.441], std=[0.267, 0.256, 0.276]) - transform = transforms.Compose([ - transforms.ToTensor(), - normalize - ]) - test_dataset = datasets.CIFAR100(root=self.get_args().get_data_path(), train=False, download=True, transform=transform) - test_loader = DataLoader(test_dataset, batch_size=len(test_dataset)) - - test_data = self.get_tuple_from_data_loader(test_loader) - - self.get_args().get_logger().debug("Finished loading CIFAR100 test data") - return test_data + test_dataset = datasets.CIFAR100(root=self.config.get_data_path(), train=False, download=True, + transform=self.DEFAULT_TRANSFORM) + sampler = DistributedSampler(test_dataset, rank=self.rank, + num_replicas=self.world_size) if self.world_size else None + test_loader = DataLoader(test_dataset, 
batch_size=self.learning_params.batch_size, sampler=sampler) + return test_loader diff --git a/fltk/datasets/data_distribution/__init__.py b/fltk/datasets/data_distribution/__init__.py deleted file mode 100644 index ab7c6369..00000000 --- a/fltk/datasets/data_distribution/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .iid_equal import distribute_batches_equally diff --git a/fltk/datasets/data_distribution/iid_equal.py b/fltk/datasets/data_distribution/iid_equal.py deleted file mode 100644 index c47bcc16..00000000 --- a/fltk/datasets/data_distribution/iid_equal.py +++ /dev/null @@ -1,19 +0,0 @@ -import torch - -def distribute_batches_equally(train_data_loader, num_workers): - """ - Gives each worker the same number of batches of training data. - - :param train_data_loader: Training data loader - :type train_data_loader: torch.utils.data.DataLoader - :param num_workers: number of workers - :type num_workers: int - """ - distributed_dataset = [[] for i in range(num_workers)] - - for batch_idx, (data, target) in enumerate(train_data_loader): - worker_idx = batch_idx % num_workers - - distributed_dataset[worker_idx].append((data, target)) - - return distributed_dataset diff --git a/fltk/datasets/dataset.py b/fltk/datasets/dataset.py index 17e6a4c6..9e7f4489 100644 --- a/fltk/datasets/dataset.py +++ b/fltk/datasets/dataset.py @@ -1,113 +1,97 @@ from abc import abstractmethod + +import torch from torch.utils.data import DataLoader from torch.utils.data import TensorDataset -import torch -import numpy - -from fltk.util.arguments import Arguments class Dataset: - def __init__(self, args: Arguments): - self.args = args - self.train_dataset = self.load_train_dataset() - self.test_dataset = self.load_test_dataset() - - def get_args(self): - """ - Returns the arguments. - - :return: Arguments - """ - return self.args - - def get_train_dataset(self): - """ - Returns the train dataset. - - :return: tuple - """ - return self.train_dataset - - def get_test_dataset(self): - """ - Returns the test dataset. - - :return: tuple - """ - return self.test_dataset - - @abstractmethod - def load_train_dataset(self): - """ - Loads & returns the training dataset. - - :return: tuple - """ - raise NotImplementedError("load_train_dataset() isn't implemented") - - @abstractmethod - def load_test_dataset(self): - """ - Loads & returns the test dataset. - - :return: tuple - """ - raise NotImplementedError("load_test_dataset() isn't implemented") - - def get_train_loader(self, batch_size, **kwargs): - """ - Return the data loader for the train dataset. - - :param batch_size: batch size of data loader - :type batch_size: int - :return: torch.utils.data.DataLoader - """ - return Dataset.get_data_loader_from_data(batch_size, self.train_dataset[0], self.train_dataset[1], **kwargs) - - def get_test_loader(self, batch_size, **kwargs): - """ - Return the data loader for the test dataset. - - :param batch_size: batch size of data loader - :type batch_size: int - :return: torch.utils.data.DataLoader - """ - return Dataset.get_data_loader_from_data(batch_size, self.test_dataset[0], self.test_dataset[1], **kwargs) - - @staticmethod - def get_data_loader_from_data(batch_size, X, Y, **kwargs): - """ - Get a data loader created from a given set of data. 
- - :param batch_size: batch size of data loader - :type batch_size: int - :param X: data features - :type X: numpy.Array() - :param Y: data labels - :type Y: numpy.Array() - :return: torch.utils.data.DataLoader - """ - X_torch = torch.from_numpy(X).float() - - if "classification_problem" in kwargs and kwargs["classification_problem"] == False: - Y_torch = torch.from_numpy(Y).float() - else: - Y_torch = torch.from_numpy(Y).long() - dataset = TensorDataset(X_torch, Y_torch) - - kwargs.pop("classification_problem", None) - - return DataLoader(dataset, batch_size=batch_size, **kwargs) - - @staticmethod - def get_tuple_from_data_loader(data_loader): - """ - Get a tuple representation of the data stored in a data loader. - - :param data_loader: data loader to get data from - :type data_loader: torch.utils.data.DataLoader - :return: tuple - """ - return (next(iter(data_loader))[0].numpy(), next(iter(data_loader))[1].numpy()) + def __init__(self, config, learning_params, rank: int, world_size: int): + self.config = config + self.learning_params = learning_params + + self.rank = rank + self.world_size = world_size + + self.train_loader = self.load_train_dataset() + self.test_loader = self.load_test_dataset() + + def get_train_dataset(self): + """ + Returns the train dataset. + + :return: tuple + """ + return self.train_loader + + def get_test_dataset(self): + """ + Returns the test dataset. + + :return: tuple + """ + return self.test_loader + + @abstractmethod + def load_train_dataset(self): + """ + Loads & returns the training dataset. + + :return: tuple + """ + raise NotImplementedError("load_train_dataset() isn't implemented") + + @abstractmethod + def load_test_dataset(self): + """ + Loads & returns the test dataset. + + :return: tuple + """ + raise NotImplementedError("load_test_dataset() isn't implemented") + + def get_train_loader(self, **kwargs): + """ + Return the data loader for the train dataset. + + :param batch_size: batch size of data loader + :type batch_size: int + :return: torch.utils.data.DataLoader + """ + return self.train_loader + + def get_test_loader(self, **kwargs): + """ + Return the data loader for the test dataset. + + :param batch_size: batch size of data loader + :type batch_size: int + :return: torch.utils.data.DataLoader + """ + return self.test_loader + + @staticmethod + def get_data_loader_from_data(batch_size, X, Y, **kwargs): + """ + Get a data loader created from a given set of data. 
+ + :param batch_size: batch size of data loader + :type batch_size: int + :param X: data features + :type X: numpy.Array() + :param Y: data labels + :type Y: numpy.Array() + :return: torch.utils.data.DataLoader + """ + X_torch = torch.from_numpy(X).float() + + if "classification_problem" in kwargs and kwargs["classification_problem"] == False: + Y_torch = torch.from_numpy(Y).float() + else: + Y_torch = torch.from_numpy(Y).long() + dataset = TensorDataset(X_torch, Y_torch) + + kwargs.pop("classification_problem", None) + + return DataLoader(dataset, batch_size=batch_size, **kwargs) diff --git a/fltk/datasets/distributed/__init__.py b/fltk/datasets/distributed/__init__.py deleted file mode 100644 index 8175430d..00000000 --- a/fltk/datasets/distributed/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from .dataset import DistDataset -from .cifar10 import DistCIFAR10Dataset -from .cifar100 import DistCIFAR100Dataset -from .fashion_mnist import DistFashionMNISTDataset diff --git a/fltk/datasets/distributed/cifar10.py b/fltk/datasets/distributed/cifar10.py deleted file mode 100644 index f25feeca..00000000 --- a/fltk/datasets/distributed/cifar10.py +++ /dev/null @@ -1,44 +0,0 @@ -from torchvision import datasets -from torchvision import transforms -from torch.utils.data import DataLoader, DistributedSampler - -from fltk.datasets.distributed.dataset import DistDataset -from fltk.strategy.data_samplers import get_sampler -import logging - - -class DistCIFAR10Dataset(DistDataset): - - def __init__(self, args): - super(DistCIFAR10Dataset, self).__init__(args) - self.init_train_dataset() - self.init_test_dataset() - - def init_train_dataset(self): - dist_loader_text = "distributed" if self.args.get_distributed() else "" - self.get_args().get_logger().debug(f"Loading '{dist_loader_text}' CIFAR10 train data") - normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) - transform = transforms.Compose([ - transforms.RandomHorizontalFlip(), - transforms.RandomCrop(32, 4), - transforms.ToTensor(), - normalize - ]) - self.train_dataset = datasets.CIFAR10(root=self.get_args().get_data_path(), train=True, download=True, - transform=transform) - self.train_sampler = get_sampler(self.train_dataset, self.args) - self.train_loader = DataLoader(self.train_dataset, batch_size=16, sampler=self.train_sampler) - logging.info("this client gets {} samples".format(len(self.train_sampler))) - - def init_test_dataset(self): - self.get_args().get_logger().debug("Loading CIFAR10 test data") - - normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) - transform = transforms.Compose([ - transforms.ToTensor(), - normalize - ]) - self.test_dataset = datasets.CIFAR10(root=self.get_args().get_data_path(), train=False, download=True, - transform=transform) - self.test_sampler = get_sampler(self.test_dataset, self.args) - self.test_loader = DataLoader(self.test_dataset, batch_size=16, sampler=self.test_sampler) diff --git a/fltk/datasets/distributed/cifar100.py b/fltk/datasets/distributed/cifar100.py deleted file mode 100644 index 329b7ae0..00000000 --- a/fltk/datasets/distributed/cifar100.py +++ /dev/null @@ -1,90 +0,0 @@ -from torchvision import datasets -from torchvision import transforms -from torch.utils.data import DataLoader, DistributedSampler - -from fltk.datasets.distributed.dataset import DistDataset -from fltk.strategy.data_samplers import get_sampler - - -class DistCIFAR100Dataset(DistDataset): - - def __init__(self, args): - super(DistCIFAR100Dataset, 
self).__init__(args) - self.init_train_dataset() - self.init_test_dataset() - - def init_train_dataset(self): - dist_loader_text = "distributed" if self.args.get_distributed() else "" - self.get_args().get_logger().debug(f"Loading '{dist_loader_text}' CIFAR100 train data") - normalize = transforms.Normalize(mean=[0.507, 0.487, 0.441], std=[0.267, 0.256, 0.276]) - transform = transforms.Compose([ - transforms.RandomHorizontalFlip(), - transforms.RandomCrop(32, 4), - transforms.ToTensor(), - normalize - ]) - self.train_dataset = datasets.CIFAR100(root=self.get_args().get_data_path(), train=True, download=True, - transform=transform) - self.train_sampler = get_sampler(self.train_dataset, self.args) - self.train_loader = DataLoader(self.train_dataset, batch_size=16, sampler=self.train_sampler) - - def init_test_dataset(self): - dist_loader_text = "distributed" if self.args.get_distributed() else "" - self.get_args().get_logger().debug(f"Loading '{dist_loader_text}' CIFAR100 test data") - - normalize = transforms.Normalize(mean=[0.507, 0.487, 0.441], std=[0.267, 0.256, 0.276]) - transform = transforms.Compose([ - transforms.ToTensor(), - normalize - ]) - self.test_dataset = datasets.CIFAR100(root=self.get_args().get_data_path(), train=False, download=True, - transform=transform) - self.test_sampler = get_sampler(self.test_dataset, self.args) - self.test_loader = DataLoader(self.test_dataset, batch_size=16, sampler=self.test_sampler) - - - def load_train_dataset(self): - dist_loader_text = "distributed" if self.args.get_distributed() else "" - self.get_args().get_logger().debug(f"Loading '{dist_loader_text}' CIFAR100 train data") - - normalize = transforms.Normalize(mean=[0.507, 0.487, 0.441], std=[0.267, 0.256, 0.276]) - transform = transforms.Compose([ - transforms.RandomHorizontalFlip(), - transforms.RandomCrop(32, 4), - transforms.ToTensor(), - normalize - ]) - - train_dataset = datasets.CIFAR100(root=self.get_args().get_data_path(), train=True, download=True, - transform=transform) - sampler = get_sampler(self.test_dataset, self.args) - - train_loader = DataLoader(train_dataset, batch_size=len(train_dataset), sampler=sampler) - self.args.set_sampler(sampler) - - train_data = self.get_tuple_from_data_loader(train_loader) - dist_loader_text = "distributed" if self.args.get_distributed() else "" - self.get_args().get_logger().debug(f"Finished loading '{dist_loader_text}' CIFAR100 train data") - - return train_data - - def load_test_dataset(self): - self.get_args().get_logger().debug("Loading CIFAR100 test data") - - normalize = transforms.Normalize(mean=[0.507, 0.487, 0.441], std=[0.267, 0.256, 0.276]) - transform = transforms.Compose([ - transforms.ToTensor(), - normalize - ]) - test_dataset = datasets.CIFAR100(root=self.get_args().get_data_path(), train=False, download=True, - transform=transform) - sampler = get_sampler(self.test_dataset, self.args) - test_loader = DataLoader(test_dataset, batch_size=len(test_dataset), sampler=sampler) - self.args.set_sampler(sampler) - - test_data = self.get_tuple_from_data_loader(test_loader) - - self.get_args().get_logger().debug("Finished loading CIFAR10 test data") - - return test_data - diff --git a/fltk/datasets/distributed/dataset.py b/fltk/datasets/distributed/dataset.py deleted file mode 100644 index 46458de1..00000000 --- a/fltk/datasets/distributed/dataset.py +++ /dev/null @@ -1,139 +0,0 @@ -from abc import abstractmethod -from torch.utils.data import DataLoader -from torch.utils.data import TensorDataset -import torch -import numpy - -from 
fltk.util.arguments import Arguments - - -class DistDataset: - - train_sampler = None - test_sampler = None - train_dataset = None - test_dataset = None - train_loader = None - test_loader = None - def __init__(self, args: Arguments): - self.args = args - # self.train_dataset = self.load_train_dataset() - # self.test_dataset = self.load_test_dataset() - - def get_args(self): - """ - Returns the arguments. - - :return: Arguments - """ - return self.args - - # def get_train_dataset(self): - # """ - # Returns the train dataset. - # - # :return: tuple - # """ - # return self.train_dataset - # - # def get_test_dataset(self): - # """ - # Returns the test dataset. - # - # :return: tuple - # """ - # return self.test_dataset - - def get_train_loader(self): - return self.train_loader - - def get_test_loader(self): - return self.test_loader - - def get_train_sampler(self): - return self.train_sampler - - def get_test_sampler(self): - return self.test_sampler - - @abstractmethod - def init_train_dataset(self): - raise NotImplementedError("load_train_dataset() isn't implemented") - - @abstractmethod - def init_test_dataset(self): - raise NotImplementedError("load_train_dataset() isn't implemented") - - # @abstractmethod - # def load_train_dataset(self): - # """ - # Loads & returns the training dataset. - # - # :return: tuple - # """ - # raise NotImplementedError("load_train_dataset() isn't implemented") - # - # @abstractmethod - # def load_test_dataset(self): - # """ - # Loads & returns the test dataset. - # - # :return: tuple - # """ - # raise NotImplementedError("load_test_dataset() isn't implemented") - - # def get_train_loader(self, batch_size, **kwargs): - # """ - # Return the data loader for the train dataset. - # - # :param batch_size: batch size of data loader - # :type batch_size: int - # :return: torch.utils.data.DataLoader - # """ - # return Dataset.get_data_loader_from_data(batch_size, self.train_dataset[0], self.train_dataset[1], **kwargs) - # - # def get_test_loader(self, batch_size, **kwargs): - # """ - # Return the data loader for the test dataset. - # - # :param batch_size: batch size of data loader - # :type batch_size: int - # :return: torch.utils.data.DataLoader - # """ - # return Dataset.get_data_loader_from_data(batch_size, self.test_dataset[0], self.test_dataset[1], **kwargs) - # - # @staticmethod - # def get_data_loader_from_data(batch_size, X, Y, **kwargs): - # """ - # Get a data loader created from a given set of data. - # - # :param batch_size: batch size of data loader - # :type batch_size: int - # :param X: data features - # :type X: numpy.Array() - # :param Y: data labels - # :type Y: numpy.Array() - # :return: torch.utils.data.DataLoader - # """ - # X_torch = torch.from_numpy(X).float() - # - # if "classification_problem" in kwargs and kwargs["classification_problem"] == False: - # Y_torch = torch.from_numpy(Y).float() - # else: - # Y_torch = torch.from_numpy(Y).long() - # dataset = TensorDataset(X_torch, Y_torch) - # - # kwargs.pop("classification_problem", None) - # - # return DataLoader(dataset, batch_size=batch_size, **kwargs) - # - # @staticmethod - # def get_tuple_from_data_loader(data_loader): - # """ - # Get a tuple representation of the data stored in a data loader. 
- # - # :param data_loader: data loader to get data from - # :type data_loader: torch.utils.data.DataLoader - # :return: tuple - # """ - # return (next(iter(data_loader))[0].numpy(), next(iter(data_loader))[1].numpy()) diff --git a/fltk/datasets/distributed/fashion_mnist.py b/fltk/datasets/distributed/fashion_mnist.py deleted file mode 100644 index cba0468b..00000000 --- a/fltk/datasets/distributed/fashion_mnist.py +++ /dev/null @@ -1,55 +0,0 @@ -from fltk.datasets.distributed import DistDataset -from torchvision import datasets -from torchvision import transforms -from torch.utils.data import DataLoader, DistributedSampler - -from fltk.strategy.data_samplers import get_sampler - - -class DistFashionMNISTDataset(DistDataset): - - def __init__(self, args): - super(DistFashionMNISTDataset, self).__init__(args) - self.init_train_dataset() - self.init_test_dataset() - - def init_train_dataset(self): - dist_loader_text = "distributed" if self.args.get_distributed() else "" - self.get_args().get_logger().debug(f"Loading '{dist_loader_text}' Fashion MNIST train data") - - self.train_dataset = datasets.FashionMNIST(root=self.get_args().get_data_path(), train=True, download=True, - transform=transforms.Compose([transforms.ToTensor()])) - self.train_sampler = get_sampler(self.train_dataset, self.args) - self.train_loader = DataLoader(self.train_dataset, batch_size=16, sampler=self.train_sampler) - - def init_test_dataset(self): - dist_loader_text = "distributed" if self.args.get_distributed() else "" - self.get_args().get_logger().debug(f"Loading '{dist_loader_text}' Fashion MNIST test data") - self.test_dataset = datasets.FashionMNIST(root=self.get_args().get_data_path(), train=False, download=True, - transform=transforms.Compose([transforms.ToTensor()])) - self.test_sampler = get_sampler(self.test_dataset, self.args) - self.test_loader = DataLoader(self.test_dataset, batch_size=16, sampler=self.test_sampler) - - def load_train_dataset(self): - self.get_args().get_logger().debug("Loading Fashion MNIST train data") - - train_dataset = datasets.FashionMNIST(self.get_args().get_data_path(), train=True, download=True, transform=transforms.Compose([transforms.ToTensor()])) - train_loader = DataLoader(train_dataset, batch_size=len(train_dataset)) - - train_data = self.get_tuple_from_data_loader(train_loader) - - self.get_args().get_logger().debug("Finished loading Fashion MNIST train data") - - return train_data - - def load_test_dataset(self): - self.get_args().get_logger().debug("Loading Fashion MNIST test data") - - test_dataset = datasets.FashionMNIST(self.get_args().get_data_path(), train=False, download=True, transform=transforms.Compose([transforms.ToTensor()])) - test_loader = DataLoader(test_dataset, batch_size=len(test_dataset)) - - test_data = self.get_tuple_from_data_loader(test_loader) - - self.get_args().get_logger().debug("Finished loading Fashion MNIST test data") - - return test_data diff --git a/fltk/datasets/fashion_mnist.py b/fltk/datasets/fashion_mnist.py index 0f851cfa..19d196a4 100644 --- a/fltk/datasets/fashion_mnist.py +++ b/fltk/datasets/fashion_mnist.py @@ -1,33 +1,28 @@ from .dataset import Dataset from torchvision import datasets from torchvision import transforms -from torch.utils.data import DataLoader +from torch.utils.data import DataLoader, DistributedSampler -class FashionMNISTDataset(Dataset): - - def __init__(self, args): - super(FashionMNISTDataset, self).__init__(args) - def load_train_dataset(self): - self.get_args().get_logger().debug("Loading Fashion MNIST train 
data") - - train_dataset = datasets.FashionMNIST(self.get_args().get_data_path(), train=True, download=True, transform=transforms.Compose([transforms.ToTensor()])) - train_loader = DataLoader(train_dataset, batch_size=len(train_dataset)) +class FashionMNISTDataset(Dataset): - train_data = self.get_tuple_from_data_loader(train_loader) + def __init__(self, config, learning_param, rank: int = 0, world_size: int = None): + super(FashionMNISTDataset, self).__init__(config, learning_param, rank, world_size) - self.get_args().get_logger().debug("Finished loading Fashion MNIST train data") + def load_train_dataset(self, rank: int = 0, world_size: int = None): + train_dataset = datasets.FashionMNIST(root=self.config.get_data_path(), train=True, download=True, + transform=transforms.Compose([transforms.ToTensor()])) + sampler = DistributedSampler(train_dataset, rank=rank, + num_replicas=self.world_size) if self.world_size else None + train_loader = DataLoader(train_dataset, batch_size=self.learning_params.batch_size, sampler=sampler, + shuffle=(sampler is None)) - return train_data + return train_loader def load_test_dataset(self): - self.get_args().get_logger().debug("Loading Fashion MNIST test data") - - test_dataset = datasets.FashionMNIST(self.get_args().get_data_path(), train=False, download=True, transform=transforms.Compose([transforms.ToTensor()])) - test_loader = DataLoader(test_dataset, batch_size=len(test_dataset)) - - test_data = self.get_tuple_from_data_loader(test_loader) - - self.get_args().get_logger().debug("Finished loading Fashion MNIST test data") - - return test_data + test_dataset = datasets.FashionMNIST(root=self.config.get_data_path(), train=False, download=True, + transform=transforms.Compose([transforms.ToTensor()])) + sampler = DistributedSampler(test_dataset, rank=self.rank, + num_replicas=self.world_size) if self.world_size else None + test_loader = DataLoader(test_dataset, batch_size=self.learning_params.batch_size, sampler=sampler) + return test_loader diff --git a/fltk/datasets/mnist.py b/fltk/datasets/mnist.py new file mode 100644 index 00000000..426b5fe6 --- /dev/null +++ b/fltk/datasets/mnist.py @@ -0,0 +1,33 @@ +from torch.utils.data import DataLoader, DistributedSampler +from torchvision import datasets +from torchvision import transforms + +from .dataset import Dataset + + +class MNIST(Dataset): + DEFAULT_TRANSFORM = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)) + ]) + + def __init__(self, config, learning_param, rank: int = 0, world_size: int = None): + super(MNIST, self).__init__(config, learning_param, rank, world_size) + + def load_train_dataset(self, rank: int = 0, world_size: int = None): + train_dataset = datasets.FashionMNIST(root=self.config.get_data_path(), train=True, download=True, + transform=self.DEFAULT_TRANSFORM) + sampler = DistributedSampler(train_dataset, rank=rank, + num_replicas=self.world_size) if self.world_size else None + train_loader = DataLoader(train_dataset, batch_size=self.learning_params.batch_size, sampler=sampler, + shuffle=(sampler is None)) + + return train_loader + + def load_test_dataset(self): + test_dataset = datasets.FashionMNIST(root=self.config.get_data_path(), train=False, download=True, + transform=self.DEFAULT_TRANSFORM) + sampler = DistributedSampler(test_dataset, rank=self.rank, + num_replicas=self.world_size) if self.world_size else None + test_loader = DataLoader(test_dataset, batch_size=self.learning_params.batch_size, sampler=sampler) + return test_loader diff --git 
a/fltk/extractor.py b/fltk/extractor.py new file mode 100644 index 00000000..a0303b94 --- /dev/null +++ b/fltk/extractor.py @@ -0,0 +1,35 @@ +import os +from argparse import Namespace + +from torchvision.datasets import FashionMNIST, CIFAR10, CIFAR100, MNIST + +from fltk.util.config import BareConfig + + +def download_datasets(args: Namespace, config: BareConfig): + """ + Function to download datasets to a system. This is currently meant to be run (using the extractor mode of FLTK) to + download all datasets into the `data` directory and include it in the Docker image that is built for the project. + (This is to prevent unnecessary load on the services that provide the datasets, and to decrease the energy footprint + of using the FLTK framework.) + @param args: Namespace object. + @type args: Namespace + @param config: FLTK configuration file, for finding the path where the datasets should be stored. + @type config: BareConfig + @return: None + @rtype: None + """ + data_path = config.get_data_path() + root = str(data_path) + + if not data_path.is_dir(): + os.makedirs(root, exist_ok=True) + + # Prepare MNIST + MNIST(root=root, download=True) + # Prepare Fashion MNIST + FashionMNIST(root=root, download=True) + # Prepare CIFAR10 + CIFAR10(root=root, download=True) + # Prepare CIFAR100 + CIFAR100(root=root, download=True) diff --git a/fltk/federator.py b/fltk/federator.py deleted file mode 100644 index 88ccf31d..00000000 --- a/fltk/federator.py +++ /dev/null @@ -1,244 +0,0 @@ -import datetime -import time -from typing import List - -from dataclass_csv import DataclassWriter -from torch.distributed import rpc - -from fltk.client import Client -from fltk.datasets.data_distribution import distribute_batches_equally -from fltk.strategy.client_selection import random_selection -from fltk.util.arguments import Arguments -from fltk.util.base_config import BareConfig -from fltk.util.data_loader_utils import load_train_data_loader, load_test_data_loader, \ - generate_data_loaders_from_distributed_dataset -from fltk.util.fed_avg import average_nn_parameters -from fltk.util.log import FLLogger -from torchsummary import summary -from torch.utils.tensorboard import SummaryWriter -from pathlib import Path -import logging - -from fltk.util.results import EpochData -from fltk.util.tensor_converter import convert_distributed_data_into_numpy - -logging.basicConfig(level=logging.DEBUG) - -def _call_method(method, rref, *args, **kwargs): - return method(rref.local_value(), *args, **kwargs) - - -def _remote_method(method, rref, *args, **kwargs): - args = [method, rref] + list(args) - return rpc.rpc_sync(rref.owner(), _call_method, args=args, kwargs=kwargs) - -def _remote_method_async(method, rref, *args, **kwargs): - args = [method, rref] + list(args) - return rpc.rpc_async(rref.owner(), _call_method, args=args, kwargs=kwargs) - -class ClientRef: - ref = None - name = "" - data_size = 0 - tb_writer = None - - def __init__(self, name, ref, tensorboard_writer): - self.name = name - self.ref = ref - self.tb_writer = tensorboard_writer - - def __repr__(self): - return self.name - -class Federator: - """ - Central component of the Federated Learning System: The Federator - - The Federator is in charge of the following tasks: - - Have a copy of the global model - - Client selection - - Aggregating the client model weights/gradients - - Saving all the metrics - - Use tensorboard to report metrics - - Keep track of timing - - """ - clients: List[ClientRef] = [] - epoch_counter = 0 - client_data = {} - - def __init__(self,
client_id_triple, num_epochs = 3, config=None): - log_rref = rpc.RRef(FLLogger()) - self.log_rref = log_rref - self.num_epoch = num_epochs - self.config = config - self.tb_path = config.output_location - self.ensure_path_exists(self.tb_path) - self.tb_writer = SummaryWriter(f'{self.tb_path}/{config.experiment_prefix}_federator') - self.create_clients(client_id_triple) - self.config.init_logger(logging) - - logging.info("Creating test client") - copy_sampler = config.data_sampler - config.data_sampler = "uniform" - self.test_data = Client("test", None, 1, 2, config) - self.test_data.init_dataloader() - config.data_sampler = copy_sampler - - - def create_clients(self, client_id_triple): - for id, rank, world_size in client_id_triple: - client = rpc.remote(id, Client, kwargs=dict(id=id, log_rref=self.log_rref, rank=rank, world_size=world_size, config=self.config)) - writer = SummaryWriter(f'{self.tb_path}/{self.config.experiment_prefix}_client_{id}') - self.clients.append(ClientRef(id, client, tensorboard_writer=writer)) - self.client_data[id] = [] - - def select_clients(self, n = 2): - return random_selection(self.clients, n) - - def ping_all(self): - for client in self.clients: - logging.info(f'Sending ping to {client}') - t_start = time.time() - answer = _remote_method(Client.ping, client.ref) - t_end = time.time() - duration = (t_end - t_start)*1000 - logging.info(f'Ping to {client} is {duration:.3}ms') - - def rpc_test_all(self): - for client in self.clients: - res = _remote_method_async(Client.rpc_test, client.ref) - while not res.done(): - pass - - def client_load_data(self): - for client in self.clients: - _remote_method_async(Client.init_dataloader, client.ref) - - def clients_ready(self): - all_ready = False - ready_clients = [] - while not all_ready: - responses = [] - for client in self.clients: - if client.name not in ready_clients: - responses.append((client, _remote_method_async(Client.is_ready, client.ref))) - all_ready = True - for res in responses: - result = res[1].wait() - if result: - logging.info(f'{res[0]} is ready') - ready_clients.append(res[0]) - else: - logging.info(f'Waiting for {res[0]}') - all_ready = False - - time.sleep(2) - logging.info('All clients are ready') - - def remote_run_epoch(self, epochs): - responses = [] - client_weights = [] - selected_clients = self.select_clients(self.config.clients_per_round) - for client in selected_clients: - responses.append((client, _remote_method_async(Client.run_epochs, client.ref, num_epoch=epochs))) - self.epoch_counter += epochs - for res in responses: - epoch_data, weights = res[1].wait() - self.client_data[epoch_data.client_id].append(epoch_data) - logging.info(f'{res[0]} had a loss of {epoch_data.loss}') - logging.info(f'{res[0]} had a epoch data of {epoch_data}') - - res[0].tb_writer.add_scalar('training loss', - epoch_data.loss_train, # for every 1000 minibatches - self.epoch_counter * res[0].data_size) - - res[0].tb_writer.add_scalar('accuracy', - epoch_data.accuracy, # for every 1000 minibatches - self.epoch_counter * res[0].data_size) - - res[0].tb_writer.add_scalar('training loss per epoch', - epoch_data.loss_train, # for every 1000 minibatches - self.epoch_counter) - - res[0].tb_writer.add_scalar('accuracy per epoch', - epoch_data.accuracy, # for every 1000 minibatches - self.epoch_counter) - - client_weights.append(weights) - updated_model = average_nn_parameters(client_weights) - - # test global model - logging.info("Testing on global test set") - self.test_data.update_nn_parameters(updated_model) - 
accuracy, loss, class_precision, class_recall = self.test_data.test() - # self.tb_writer.add_scalar('training loss', loss, self.epoch_counter * self.test_data.get_client_datasize()) # does not seem to work :( ) - self.tb_writer.add_scalar('accuracy', accuracy, self.epoch_counter * self.test_data.get_client_datasize()) - self.tb_writer.add_scalar('accuracy per epoch', accuracy, self.epoch_counter) - - - responses = [] - for client in self.clients: - responses.append( - (client, _remote_method_async(Client.update_nn_parameters, client.ref, new_params=updated_model))) - - for res in responses: - res[1].wait() - logging.info('Weights are updated') - - def update_client_data_sizes(self): - responses = [] - for client in self.clients: - responses.append((client, _remote_method_async(Client.get_client_datasize, client.ref))) - for res in responses: - res[0].data_size = res[1].wait() - logging.info(f'{res[0]} had a result of datasize={res[0].data_size}') - - def remote_test_sync(self): - responses = [] - for client in self.clients: - responses.append((client, _remote_method_async(Client.test, client.ref))) - - for res in responses: - accuracy, loss, class_precision, class_recall = res[1].wait() - logging.info(f'{res[0]} had a result of accuracy={accuracy}') - - def save_epoch_data(self): - file_output = f'./{self.config.output_location}' - self.ensure_path_exists(file_output) - for key in self.client_data: - filename = f'{file_output}/{key}_epochs.csv' - logging.info(f'Saving data at {filename}') - with open(filename, "w") as f: - w = DataclassWriter(f, self.client_data[key], EpochData) - w.write() - - def ensure_path_exists(self, path): - Path(path).mkdir(parents=True, exist_ok=True) - - def run(self): - """ - Main loop of the Federator - :return: - """ - # # Make sure the clients have loaded all the data - self.client_load_data() - self.ping_all() - self.clients_ready() - self.update_client_data_sizes() - - epoch_to_run = self.num_epoch - addition = 0 - epoch_to_run = self.config.epochs - epoch_size = self.config.epochs_per_cycle - for epoch in range(epoch_to_run): - print(f'Running epoch {epoch}') - self.remote_run_epoch(epoch_size) - addition += 1 - logging.info('Printing client data') - print(self.client_data) - - logging.info(f'Saving data') - self.save_epoch_data() - logging.info(f'Federator is stopping') - diff --git a/fltk/launch.py b/fltk/launch.py index d0e49904..bef1791d 100644 --- a/fltk/launch.py +++ b/fltk/launch.py @@ -1,72 +1,111 @@ -import os -import sys -import torch.distributed.rpc as rpc import logging +import os +from argparse import Namespace +from multiprocessing.pool import ThreadPool -import yaml -import argparse +import torch.distributed as dist +from kubernetes import config -import torch.multiprocessing as mp -from fltk.federator import Federator -from fltk.util.base_config import BareConfig +from fltk.client import Client +from fltk.extractor import download_datasets +from fltk.orchestrator import Orchestrator +from fltk.util.cluster.client import ClusterManager +from fltk.util.config.arguments import LearningParameters +from fltk.util.config.base_config import BareConfig +from fltk.util.task.generator.arrival_generator import ExperimentGenerator -logging.basicConfig(level=logging.DEBUG) +def should_distribute() -> bool: + """ + Function to check whether distributed execution is needed. 
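(As an illustration of the check documented above: the snippet below simulates the environment that a two-replica PyTorchJob would create; the WORLD_SIZE value is purely hypothetical and set by hand for the example.)

```python
import os

import torch.distributed as dist

# Hypothetical: pretend KubeFlow deployed this process as one of two replicas.
os.environ['WORLD_SIZE'] = '2'

world_size = int(os.environ.get('WORLD_SIZE', 1))
# Distributed execution is used only when a backend is available AND there is more than one replica.
print(dist.is_available() and world_size > 1)  # typically True on a standard PyTorch build
```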
-def run_ps(rpc_ids_triple, args): - print(f'Starting the federator...') - fed = Federator(rpc_ids_triple, config=args) - fed.run() + Note: the WORLD_SIZE environment variable needs to be set for this to work (larger than 1). + PyTorchJobs launched from KubeFlow automatically set this property. + @return: Indicator for distributed execution. + @rtype: bool + """ + world_size = int(os.environ.get('WORLD_SIZE', 1)) + return dist.is_available() and world_size > 1 -def run_single(rank, world_size, host = None, args = None, nic = None): - logging.info(f'Starting with rank={rank} and world size={world_size}') - if host: - os.environ['MASTER_ADDR'] = host - else: - os.environ['MASTER_ADDR'] = '0.0.0.0' - os.environ['MASTER_PORT'] = '5000' - if nic: - os.environ['GLOO_SOCKET_IFNAME'] = nic - os.environ['TP_SOCKET_IFNAME'] = nic - else: - os.environ['GLOO_SOCKET_IFNAME'] = 'wlo1' - os.environ['TP_SOCKET_IFNAME'] = 'wlo1' + +def launch_client(task_id: str, config: BareConfig = None, learning_params: LearningParameters = None, + namespace: Namespace = None): + """ + @param task_id: String representation (should be unique) corresponding to a client. + @type task_id: str + @param config: Configuration for components, needed for spinning up components of the Orchestrator. + @type config: BareConfig + @param learning_params: Parsed configuration of Hyper-Parameters for learning. + @type learning_params: LearningParameters + @param namespace: Parsed CLI arguments, used here to select the distributed backend. + @type namespace: Namespace + @return: None + @rtype: None + """ logging.info(f'Starting with host={os.environ["MASTER_ADDR"]} and port={os.environ["MASTER_PORT"]}') - options = rpc.TensorPipeRpcBackendOptions( - num_worker_threads=16, - rpc_timeout=0, # infinite timeout - init_method=f'tcp://{os.environ["MASTER_ADDR"]}:{os.environ["MASTER_PORT"]}' - ) - - if rank != 0: - logging.info(f'Starting worker {rank}') - rpc.init_rpc( - f"client{rank}", - rank=rank, - world_size=world_size, - rpc_backend_options=options, - ) - # trainer passively waiting for ps to kick off training iterations + rank, world_size, backend = 0, None, None + distributed = should_distribute() + if distributed: + logging.info(f'Initializing backend for training process: {namespace.backend}') + dist.init_process_group(namespace.backend) + rank = dist.get_rank() + world_size = dist.get_world_size() + + logging.info(f'Creating client with rank={rank}') + + client = Client(rank, task_id, world_size, config, learning_params) + client.prepare_learner(distributed) + epoch_data = client.run_epochs() + print(epoch_data) + + +def launch_orchestrator(args: Namespace = None, conf: BareConfig = None): + """ + Default runner for the Orchestrator that is based on KubeFlow. + @param args: Commandline arguments passed to the execution. Might be removed in a future commit. + @type args: Namespace + @param conf: Configuration for execution of the Orchestrator's components, needed for spinning up components of the + Orchestrator.
+ @type conf: BareConfig + @return: None + @rtype: None + """ + logging.info('Starting as Orchestrator') + logging.info("Starting Orchestrator, initializing resources....") + if args.local: + logging.info("Loading local configuration file") + config.load_kube_config() else: - logging.info('Starting the ps') - rpc.init_rpc( - "ps", - rank=rank, - world_size=world_size, - rpc_backend_options=options - - ) - run_ps([(f"client{r}", r, world_size) for r in range(1, world_size)], args) - # block until all rpc finish - rpc.shutdown() - - -def run_spawn(config): - world_size = config.world_size - master_address = config.federator_host - mp.spawn( - run_single, - args=(world_size, master_address, config), - nprocs=world_size, - join=True - ) \ No newline at end of file + logging.info("Loading in-cluster configuration file") + config.load_incluster_config() + + logging.info("Pointing configuration to in-cluster configuration.") + conf.cluster_config.load_incluster_namespace() + conf.cluster_config.load_incluster_image() + + arrival_generator = ExperimentGenerator() + cluster_manager = ClusterManager() + + orchestrator = Orchestrator(cluster_manager, arrival_generator, conf) + + pool = ThreadPool(3) + logging.info("Starting cluster manager") + pool.apply(cluster_manager.start) + logging.info("Starting arrival generator") + pool.apply_async(arrival_generator.start, args=[conf.get_duration()]) + logging.info("Starting orchestrator") + pool.apply(orchestrator.run) + pool.join() + + logging.info("Stopped execution of Orchestrator...") + + +def launch_extractor(args: Namespace, conf: BareConfig): + """ + Extractor launch function, will only download all datasets and quit execution. + @param args: Arguments passed from CLI. + @type args: Namespace + @param conf: Parsed configuration file passed from the CLI.
+ @type conf: BareConfig + @return: None + @rtype: None + """ + download_datasets(args, conf) diff --git a/fltk/nets/__init__.py b/fltk/nets/__init__.py index 432dbca9..8f9abe90 100644 --- a/fltk/nets/__init__.py +++ b/fltk/nets/__init__.py @@ -2,5 +2,7 @@ from .cifar_100_resnet import Cifar100ResNet from .fashion_mnist_cnn import FashionMNISTCNN from .fashion_mnist_resnet import FashionMNISTResNet -from .cifar_10_resnet import Cifar10ResNet -from .cifar_100_vgg import Cifar100VGG \ No newline at end of file +from .cifar_10_resnet import Cifar10ResNet, ResNet18, ResNet34, ResNet50, ResNet101, ResNet152 +from .cifar_100_vgg import Cifar100VGG, vgg11_bn, vgg13_bn, vgg16_bn, vgg19_bn +from .reddit_lstm import RNNModel +from .simple import SimpleMnist, SimpleNet diff --git a/fltk/nets/cifar_100_resnet.py b/fltk/nets/cifar_100_resnet.py index 4651fe25..5e9826a3 100644 --- a/fltk/nets/cifar_100_resnet.py +++ b/fltk/nets/cifar_100_resnet.py @@ -1,20 +1,20 @@ import torch.nn as nn -import torch.nn.functional as F + class BasicBlock(nn.Module): """Basic Block for resnet 18 and resnet 34 """ - #BasicBlock and BottleNeck block - #have different output size - #we use class attribute expansion - #to distinct + # BasicBlock and Bottleneck blocks + # have different output sizes; + # we use the class attribute expansion + # to distinguish them expansion = 1 def __init__(self, in_channels, out_channels, stride=1): super().__init__() - #residual function + # residual function self.residual_function = nn.Sequential( nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False), nn.BatchNorm2d(out_channels), @@ -23,11 +23,11 @@ def __init__(self, in_channels, out_channels, stride=1): nn.BatchNorm2d(out_channels * BasicBlock.expansion) ) - #shortcut + # shortcut self.shortcut = nn.Sequential() - #the shortcut output dimension is not the same with residual function - #use 1*1 convolution to match the dimension + # the shortcut output dimension is not the same as the residual function's; + # use a 1*1 convolution to match the dimension if stride != 1 or in_channels != BasicBlock.expansion * out_channels: self.shortcut = nn.Sequential( nn.Conv2d(in_channels, out_channels * BasicBlock.expansion, kernel_size=1, stride=stride, bias=False), @@ -37,10 +37,12 @@ def __init__(self, in_channels, out_channels, stride=1): def forward(self, x): return nn.ReLU(inplace=True)(self.residual_function(x) + self.shortcut(x)) -class BottleNeck(nn.Module): + +class Bottleneck(nn.Module): """Residual block for resnet over 50 layers """ expansion = 4 + def __init__(self, in_channels, out_channels, stride=1): super().__init__() self.residual_function = nn.Sequential( @@ -50,25 +52,27 @@ def __init__(self, in_channels, out_channels, stride=1): nn.Conv2d(out_channels, out_channels, stride=stride, kernel_size=3, padding=1, bias=False), nn.BatchNorm2d(out_channels), nn.ReLU(inplace=True), - nn.Conv2d(out_channels, out_channels * BottleNeck.expansion, kernel_size=1, bias=False), - nn.BatchNorm2d(out_channels * BottleNeck.expansion), + nn.Conv2d(out_channels, out_channels * Bottleneck.expansion, kernel_size=1, bias=False), + nn.BatchNorm2d(out_channels * Bottleneck.expansion), ) self.shortcut = nn.Sequential() - if stride != 1 or in_channels != out_channels * BottleNeck.expansion: + if stride != 1 or in_channels != out_channels * Bottleneck.expansion: self.shortcut = nn.Sequential( - nn.Conv2d(in_channels, out_channels * BottleNeck.expansion, stride=stride, kernel_size=1, bias=False), - nn.BatchNorm2d(out_channels * 
BottleNeck.expansion) + nn.Conv2d(in_channels, out_channels * Bottleneck.expansion, stride=stride, kernel_size=1, bias=False), + nn.BatchNorm2d(out_channels * Bottleneck.expansion) ) def forward(self, x): return nn.ReLU(inplace=True)(self.residual_function(x) + self.shortcut(x)) -class Cifar100ResNet(nn.Module): - def __init__(self, block = BasicBlock, num_block =[2, 2, 2, 2], num_classes=100): +class Cifar100ResNet(nn.Module): + def __init__(self, block: nn.Module = BasicBlock, num_block=None, num_classes=100): super(Cifar100ResNet, self).__init__() + if num_block is None: + num_block = [2, 2, 2, 2] self.in_channels = 64 @@ -76,8 +80,8 @@ def __init__(self, block = BasicBlock, num_block =[2, 2, 2, 2], num_classes=100) nn.Conv2d(3, 64, kernel_size=3, padding=1, bias=False), nn.BatchNorm2d(64), nn.ReLU(inplace=True)) - #we use a different inputsize than the original paper - #so conv2_x's stride is 1 + # we use a different input size than the original paper, + # so conv2_x's stride is 1 self.conv2_x = self._make_layer(block, 64, num_block[0], 1) self.conv3_x = self._make_layer(block, 128, num_block[1], 2) self.conv4_x = self._make_layer(block, 256, num_block[2], 2) @@ -120,27 +124,27 @@ def forward(self, x): return output -def resnet18(): - """ return a ResNet 18 object - """ - return Cifar100ResNet(BasicBlock, [2, 2, 2, 2]) -def resnet34(): - """ return a ResNet 34 object - """ - return Cifar100ResNet(BasicBlock, [3, 4, 6, 3]) +class ResNet18(Cifar100ResNet): + def __init__(self): + super(ResNet18, self).__init__(BasicBlock, [2, 2, 2, 2]) -def resnet50(): - """ return a ResNet 50 object - """ - return Cifar100ResNet(BottleNeck, [3, 4, 6, 3]) -def resnet101(): - """ return a ResNet 101 object - """ - return Cifar100ResNet(BottleNeck, [3, 4, 23, 3]) +class ResNet34(Cifar100ResNet): + def __init__(self): + super(ResNet34, self).__init__(BasicBlock, [3, 4, 6, 3]) -def resnet152(): - """ return a ResNet 152 object - """ - return Cifar100ResNet(BottleNeck, [3, 8, 36, 3]) + +class ResNet50(Cifar100ResNet): + def __init__(self): + super(ResNet50, self).__init__(Bottleneck, [3, 4, 6, 3]) + + +class ResNet101(Cifar100ResNet): + def __init__(self): + super(ResNet101, self).__init__(Bottleneck, [3, 4, 23, 3]) + + +class ResNet152(Cifar100ResNet): + def __init__(self): + super(ResNet152, self).__init__(Bottleneck, [3, 8, 36, 3]) diff --git a/fltk/nets/cifar_100_vgg.py b/fltk/nets/cifar_100_vgg.py index 112b969a..05e72232 100644 --- a/fltk/nets/cifar_100_vgg.py +++ b/fltk/nets/cifar_100_vgg.py @@ -1,11 +1,10 @@ -import torch import torch.nn as nn cfg = { - 'A' : [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'], - 'B' : [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'], - 'D' : [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'], - 'E' : [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'] + 'A': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'], + 'B': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'], + 'D': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'], + 'E': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'] } @@ -28,9 +27,10 @@ def make_layers(cfg, batch_norm=False): return nn.Sequential(*layers) + class Cifar100VGG(nn.Module): - def __init__(self, features = make_layers(cfg['D'], batch_norm=True), num_class=100): + def __init__(self, features=make_layers(cfg['D'],
batch_norm=True), num_class=100): super(Cifar100VGG, self).__init__() self.features = features @@ -55,11 +55,14 @@ def forward(self, x): def vgg11_bn(): return Cifar100VGG(make_layers(cfg['A'], batch_norm=True)) + def vgg13_bn(): return Cifar100VGG(make_layers(cfg['B'], batch_norm=True)) + def vgg16_bn(): return Cifar100VGG(make_layers(cfg['D'], batch_norm=True)) + def vgg19_bn(): - return Cifar100VGG(make_layers(cfg['E'], batch_norm=True)) \ No newline at end of file + return Cifar100VGG(make_layers(cfg['E'], batch_norm=True)) diff --git a/fltk/nets/cifar_10_cnn.py b/fltk/nets/cifar_10_cnn.py index bf4c0b2e..4a3fb05c 100644 --- a/fltk/nets/cifar_10_cnn.py +++ b/fltk/nets/cifar_10_cnn.py @@ -1,4 +1,3 @@ -import torch import torch.nn as nn import torch.nn.functional as F @@ -26,6 +25,8 @@ def __init__(self): self.pool3 = nn.MaxPool2d(kernel_size=2) self.fc1 = nn.Linear(128 * 4 * 4, 128) + + self.softmax = nn.Softmax(dim=1) self.fc2 = nn.Linear(128, 10) def forward(self, x): @@ -44,6 +45,6 @@ def forward(self, x): x = x.view(-1, 128 * 4 * 4) x = self.fc1(x) - x = F.softmax(self.fc2(x)) + x = self.softmax(self.fc2(x)) - return x + return x \ No newline at end of file diff --git a/fltk/nets/cifar_10_resnet.py b/fltk/nets/cifar_10_resnet.py index 08a22313..9eb3106a 100644 --- a/fltk/nets/cifar_10_resnet.py +++ b/fltk/nets/cifar_10_resnet.py @@ -15,11 +15,11 @@ def __init__(self, in_planes, planes, stride=1): self.bn2 = nn.BatchNorm2d(planes) self.shortcut = nn.Sequential() - if stride != 1 or in_planes != self.expansion*planes: + if stride != 1 or in_planes != self.expansion * planes: self.shortcut = nn.Sequential( - nn.Conv2d(in_planes, self.expansion*planes, + nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False), - nn.BatchNorm2d(self.expansion*planes) + nn.BatchNorm2d(self.expansion * planes) ) def forward(self, x): @@ -42,14 +42,14 @@ def __init__(self, in_planes, planes, stride=1): self.bn2 = nn.BatchNorm2d(planes) self.conv3 = nn.Conv2d(planes, self.expansion * planes, kernel_size=1, bias=False) - self.bn3 = nn.BatchNorm2d(self.expansion*planes) + self.bn3 = nn.BatchNorm2d(self.expansion * planes) self.shortcut = nn.Sequential() - if stride != 1 or in_planes != self.expansion*planes: + if stride != 1 or in_planes != self.expansion * planes: self.shortcut = nn.Sequential( - nn.Conv2d(in_planes, self.expansion*planes, + nn.Conv2d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False), - nn.BatchNorm2d(self.expansion*planes) + nn.BatchNorm2d(self.expansion * planes) ) def forward(self, x): @@ -62,7 +62,7 @@ def forward(self, x): class Cifar10ResNet(nn.Module): - def __init__(self, block = BasicBlock, num_blocks =[2, 2, 2, 2], num_classes=10): + def __init__(self, block: nn.Module = BasicBlock, num_blocks=[2, 2, 2, 2], num_classes=10): super(Cifar10ResNet, self).__init__() self.in_planes = 64 @@ -73,10 +73,10 @@ def __init__(self, block = BasicBlock, num_blocks =[2, 2, 2, 2], num_classes=10) self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2) self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2) self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) - self.linear = nn.Linear(512*block.expansion, num_classes) + self.linear = nn.Linear(512 * block.expansion, num_classes) def _make_layer(self, block, planes, num_blocks, stride): - strides = [stride] + [1]*(num_blocks-1) + strides = [stride] + [1] * (num_blocks - 1) layers = [] for stride in strides: layers.append(block(self.in_planes, planes, stride))
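(A side note on the `strides = [stride] + [1] * (num_blocks - 1)` bookkeeping above: only the first block of each layer downsamples, while the remaining blocks keep the resolution. A tiny illustration, assuming the default ResNet-18 layout as input values:)

```python
# _make_layer(block, planes=256, num_blocks=2, stride=2) builds its stride list as:
num_blocks, stride = 2, 2
strides = [stride] + [1] * (num_blocks - 1)
print(strides)  # [2, 1] -> only the first block halves the spatial resolution
```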
@@ -95,21 +95,27 @@ def forward(self, x): return out -def ResNet18(): - return Cifar10ResNet(BasicBlock, [2, 2, 2, 2]) +class ResNet18(Cifar10ResNet): + def __init__(self): + super(ResNet18, self).__init__(BasicBlock, [2, 2, 2, 2]) -def ResNet34(): - return Cifar10ResNet(BasicBlock, [3, 4, 6, 3]) +class ResNet34(Cifar10ResNet): + def __init__(self): + super(ResNet34, self).__init__(BasicBlock, [3, 4, 6, 3]) -def ResNet50(): - return Cifar10ResNet(Bottleneck, [3, 4, 6, 3]) +class ResNet50(Cifar10ResNet): + def __init__(self): + super(ResNet50, self).__init__(Bottleneck, [3, 4, 6, 3]) -def ResNet101(): - return Cifar10ResNet(Bottleneck, [3, 4, 23, 3]) +class ResNet101(Cifar10ResNet): + def __init__(self): + super(ResNet101, self).__init__(Bottleneck, [3, 4, 23, 3]) -def ResNet152(): - return Cifar10ResNet(Bottleneck, [3, 8, 36, 3]) \ No newline at end of file + +class ResNet152(Cifar10ResNet): + def __init__(self): + super(ResNet152, self).__init__(Bottleneck, [3, 8, 36, 3]) diff --git a/fltk/nets/fashion_mnist_cnn.py b/fltk/nets/fashion_mnist_cnn.py index 0c4532c4..a7c680d6 100644 --- a/fltk/nets/fashion_mnist_cnn.py +++ b/fltk/nets/fashion_mnist_cnn.py @@ -1,6 +1,5 @@ -import torch import torch.nn as nn -import torch.nn.functional as F + class FashionMNISTCNN(nn.Module): @@ -17,15 +16,12 @@ def __init__(self): nn.BatchNorm2d(32), nn.ReLU(), nn.MaxPool2d(2)) - - self.fc = nn.Linear(7*7*32, 10) + self.flatten = nn.Flatten() + self.fc = nn.Linear(7 * 7 * 32, 10) def forward(self, x): x = self.layer1(x) x = self.layer2(x) - - x = x.view(x.size(0), -1) - - x = self.fc(x) + x = self.fc(self.flatten(x)) return x diff --git a/fltk/nets/fashion_mnist_resnet.py b/fltk/nets/fashion_mnist_resnet.py index 43e09925..b323da26 100644 --- a/fltk/nets/fashion_mnist_resnet.py +++ b/fltk/nets/fashion_mnist_resnet.py @@ -1,62 +1,67 @@ -import torch import torch.nn as nn -import torch.nn.functional as F + class Residual(nn.Module): - def __init__(self,in_channel,num_channel,use_conv1x1=False,strides=1): - super(Residual,self).__init__() - self.relu=nn.ReLU() - self.bn1=nn.BatchNorm2d(in_channel,eps=1e-3) - self.conv1=nn.Conv2d(in_channels =in_channel,out_channels=num_channel,kernel_size=3,padding=1,stride=strides) - self.bn2=nn.BatchNorm2d(num_channel,eps=1e-3) - self.conv2=nn.Conv2d(in_channels=num_channel,out_channels=num_channel,kernel_size=3,padding=1) + def __init__(self, in_channel, num_channel, use_conv1x1=False, strides=1): + super(Residual, self).__init__() + self.relu = nn.ReLU() + self.bn1 = nn.BatchNorm2d(in_channel, eps=1e-3) + self.conv1 = nn.Conv2d(in_channels=in_channel, out_channels=num_channel, kernel_size=3, padding=1, + stride=strides) + self.bn2 = nn.BatchNorm2d(num_channel, eps=1e-3) + self.conv2 = nn.Conv2d(in_channels=num_channel, out_channels=num_channel, kernel_size=3, padding=1) if use_conv1x1: - self.conv3=nn.Conv2d(in_channels=in_channel,out_channels=num_channel,kernel_size=1,stride=strides) + self.conv3 = nn.Conv2d(in_channels=in_channel, out_channels=num_channel, kernel_size=1, stride=strides) else: - self.conv3=None - + self.conv3 = None def forward(self, x): - y=self.conv1(self.relu(self.bn1(x))) - y=self.conv2(self.relu(self.bn2(y))) + y = self.conv1(self.relu(self.bn1(x))) + y = self.conv2(self.relu(self.bn2(y))) # print (y.shape) if self.conv3: - x=self.conv3(x) + x = self.conv3(x) # print (x.shape) - z=y+x + z = y + x return z -def ResNet_block(in_channels,num_channels,num_residuals,first_block=False): - layers=[] + +def ResNet_block(in_channels, num_channels, num_residuals, 
first_block=False): + layers = [] for i in range(num_residuals): - if i==0 and not first_block: - layers += [Residual(in_channels,num_channels,use_conv1x1=True,strides=2)] - elif i>0 and not first_block: - layers += [Residual(num_channels,num_channels)] + if i == 0 and not first_block: + layers += [Residual(in_channels, num_channels, use_conv1x1=True, strides=2)] + elif i > 0 and not first_block: + layers += [Residual(num_channels, num_channels)] else: layers += [Residual(in_channels, num_channels)] - blk=nn.Sequential(*layers) + blk = nn.Sequential(*layers) return blk + class FashionMNISTResNet(nn.Module): - def __init__(self,in_channel = 1 ,num_classes = 10): - super(FashionMNISTResNet,self).__init__() - self.block1=nn.Sequential(nn.Conv2d(in_channels=in_channel,out_channels=64,kernel_size=7,stride=2,padding=3), - nn.BatchNorm2d(64), - nn.ReLU(), - nn.MaxPool2d(kernel_size=3,stride=2,padding=1)) - self.block2=nn.Sequential(ResNet_block(64,64,2,True), - ResNet_block(64,128,2), - ResNet_block(128,256,2), - ResNet_block(256,512,2)) - self.block3=nn.Sequential(nn.AvgPool2d(kernel_size=3)) - self.Dense=nn.Linear(512,10) - - - def forward(self,x): - y=self.block1(x) - y=self.block2(y) - y=self.block3(y) - y=y.view(-1,512) - y=self.Dense(y) - return y \ No newline at end of file + def __init__(self, in_channel=1, num_classes=10): + super(FashionMNISTResNet, self).__init__() + self.block1 = nn.Sequential( + nn.Conv2d(in_channels=in_channel, out_channels=64, kernel_size=7, stride=2, padding=3), + nn.BatchNorm2d(64), + nn.ReLU(), + nn.MaxPool2d(kernel_size=3, stride=2, padding=1)) + self.block2 = nn.Sequential(ResNet_block(64, 64, 2, True), + ResNet_block(64, 128, 2), + ResNet_block(128, 256, 2), + ResNet_block(256, 512, 2)) + # Use adaptive pooling: the original AvgPool2d(kernel_size=3) cannot be applied to the + # (512, 1, 1) feature map produced by block2, so padding or AdaptiveAvgPool2d is needed. + self.block3 = nn.AdaptiveAvgPool2d((1, 1)) + # Use a Flatten layer instead of reshape/view to flatten the input of the 'Dense' layer, for readability.
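+ # Shape walk-through for a 28x28 Fashion MNIST input (illustrative): block1 yields (64, 7, 7), block2 ends + # at (512, 1, 1), AdaptiveAvgPool2d((1, 1)) keeps (512, 1, 1), and Flatten then hands a 512-dimensional + # vector to the 'Dense' layer below.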
+ self.flatten = nn.Flatten() + self.Dense = nn.Linear(512, num_classes) + + def forward(self, x): + y = self.block1(x) + y = self.block2(y) + + y = self.block3(y) + y = self.Dense(self.flatten(y)) + return y diff --git a/fltk/nets/reddit_lstm.py b/fltk/nets/reddit_lstm.py index 4b7f71f9..dc83de9e 100644 --- a/fltk/nets/reddit_lstm.py +++ b/fltk/nets/reddit_lstm.py @@ -1,7 +1,8 @@ import torch.nn as nn -from fltk.nets.simple import SimpleNet from torch.autograd import Variable +from fltk.nets.simple import SimpleNet + class RNNModel(SimpleNet): """Container module with an encoder, a recurrent module, and a decoder.""" @@ -16,7 +17,7 @@ def __init__(self, name, created_time, rnn_type, ntoken, ninp, nhid, nlayers, dr try: nonlinearity = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}[rnn_type] except KeyError: - raise ValueError( """An invalid option for `--model` was supplied, + raise ValueError("""An invalid option for `--model` was supplied, options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""") self.rnn = nn.RNN(ninp, nhid, nlayers, nonlinearity=nonlinearity, dropout=dropout) self.decoder = nn.Linear(nhid, ntoken) @@ -49,7 +50,7 @@ def forward(self, input, hidden): emb = self.drop(self.encoder(input)) output, hidden = self.rnn(emb, hidden) output = self.drop(output) - decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2))) + decoded = self.decoder(output.view(output.size(0) * output.size(1), output.size(2))) return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden def init_hidden(self, bsz): @@ -58,4 +59,4 @@ def init_hidden(self, bsz): return (Variable(weight.new(self.nlayers, bsz, self.nhid).zero_()), Variable(weight.new(self.nlayers, bsz, self.nhid).zero_())) else: - return Variable(weight.new(self.nlayers, bsz, self.nhid).zero_()) \ No newline at end of file + return Variable(weight.new(self.nlayers, bsz, self.nhid).zero_()) diff --git a/fltk/nets/simple.py b/fltk/nets/simple.py index 7c7a5e76..1cb4987f 100644 --- a/fltk/nets/simple.py +++ b/fltk/nets/simple.py @@ -1,55 +1,43 @@ -import argparse +import numpy as np import torch import torch.nn as nn import torch.nn.functional as F -import torch.optim as optim -from torchvision import datasets, transforms -from torch.autograd import Variable -import numpy as np -import datetime class SimpleNet(nn.Module): def __init__(self, name=None, created_time=None): super(SimpleNet, self).__init__() self.created_time = created_time - self.name=name - - + self.name = name def visualize(self, vis, epoch, acc, loss=None, eid='main', is_poisoned=False, name=None): if name is None: name = self.name + '_poisoned' if is_poisoned else self.name vis.line(X=np.array([epoch]), Y=np.array([acc]), name=name, win='vacc_{0}'.format(self.created_time), env=eid, - update='append' if vis.win_exists('vacc_{0}'.format(self.created_time), env=eid) else None, - opts=dict(showlegend=True, title='Accuracy_{0}'.format(self.created_time), - width=700, height=400)) + update='append' if vis.win_exists('vacc_{0}'.format(self.created_time), env=eid) else None, + opts=dict(showlegend=True, title='Accuracy_{0}'.format(self.created_time), + width=700, height=400)) if loss is not None: vis.line(X=np.array([epoch]), Y=np.array([loss]), name=name, env=eid, - win='vloss_{0}'.format(self.created_time), - update='append' if vis.win_exists('vloss_{0}'.format(self.created_time), env=eid) else None, - opts=dict(showlegend=True, title='Loss_{0}'.format(self.created_time), width=700, height=400)) + win='vloss_{0}'.format(self.created_time), + 
update='append' if vis.win_exists('vloss_{0}'.format(self.created_time), env=eid) else None, + opts=dict(showlegend=True, title='Loss_{0}'.format(self.created_time), width=700, height=400)) return - - def train_vis(self, vis, epoch, data_len, batch, loss, eid='main', name=None, win='vtrain'): - vis.line(X=np.array([(epoch-1)*data_len+batch]), Y=np.array([loss]), - env=eid, - name=f'{name}' if name is not None else self.name, win=f'{win}_{self.created_time}', - update='append' if vis.win_exists(f'{win}_{self.created_time}', env=eid) else None, - opts=dict(showlegend=True, width=700, height=400, title='Train loss_{0}'.format(self.created_time))) - - + vis.line(X=np.array([(epoch - 1) * data_len + batch]), Y=np.array([loss]), + env=eid, + name=f'{name}' if name is not None else self.name, win=f'{win}_{self.created_time}', + update='append' if vis.win_exists(f'{win}_{self.created_time}', env=eid) else None, + opts=dict(showlegend=True, width=700, height=400, title='Train loss_{0}'.format(self.created_time))) def save_stats(self, epoch, loss, acc): self.stats['epoch'].append(epoch) self.stats['loss'].append(loss) self.stats['acc'].append(acc) - def copy_params(self, state_dict, coefficient_transfer=100): own_state = self.state_dict() @@ -62,13 +50,11 @@ def copy_params(self, state_dict, coefficient_transfer=100): # torch.cuda.FloatTensor) random_tensor = (torch.FloatTensor(shape).random_(0, 100) <= coefficient_transfer).type( torch.FloatTensor) - negative_tensor = (random_tensor*-1)+1 + negative_tensor = (random_tensor * -1) + 1 # own_state[name].copy_(param) own_state[name].copy_(param.clone()) - - class SimpleMnist(SimpleNet): def __init__(self, name=None, created_time=None): super(SimpleMnist, self).__init__(name, created_time) @@ -78,7 +64,6 @@ def __init__(self, name=None, created_time=None): self.fc1 = nn.Linear(320, 50) self.fc2 = nn.Linear(50, 10) - def forward(self, x): x = F.relu(F.max_pool2d(self.conv1(x), 2)) x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2)) @@ -86,4 +71,4 @@ def forward(self, x): x = F.relu(self.fc1(x)) x = F.dropout(x, training=self.training) x = self.fc2(x) - return F.log_softmax(x, dim=1) \ No newline at end of file + return F.log_softmax(x, dim=1) diff --git a/fltk/nets/util/__init__.py b/fltk/nets/util/__init__.py new file mode 100644 index 00000000..5e41e6c0 --- /dev/null +++ b/fltk/nets/util/__init__.py @@ -0,0 +1,3 @@ +from .reproducability import init_reproducibility +from .model import save_model, flatten_params, recover_flattened, load_model_from_file +from .evaluation import calculate_class_recall, calculate_class_precision \ No newline at end of file diff --git a/fltk/util/fed_avg.py b/fltk/nets/util/aggregration.py similarity index 78% rename from fltk/util/fed_avg.py rename to fltk/nets/util/aggregration.py index e60d1684..7495b620 100644 --- a/fltk/util/fed_avg.py +++ b/fltk/nets/util/aggregration.py @@ -1,6 +1,6 @@ def average_nn_parameters(parameters): """ - Averages passed parameters. + Takes unweighted average of a list of Tensor weights. Averages passed parameters. 
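+ + Illustrative usage (model_a and model_b are assumed to be two nn.Module instances of the same architecture):: + + averaged = average_nn_parameters([model_a.state_dict(), model_b.state_dict()])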
:param parameters: nn model named parameters :type parameters: list @@ -8,5 +8,4 @@ def average_nn_parameters(parameters): new_params = {} for name in parameters[0].keys(): new_params[name] = sum([param[name].data for param in parameters]) / len(parameters) - return new_params diff --git a/fltk/nets/util/evaluation.py b/fltk/nets/util/evaluation.py new file mode 100644 index 00000000..e990695b --- /dev/null +++ b/fltk/nets/util/evaluation.py @@ -0,0 +1,15 @@ +import numpy as np + + +def calculate_class_precision(conf_mat: np.array) -> np.array: + """ + Calculates the precision for each class from a confusion matrix. + """ + return np.diagonal(conf_mat) / np.sum(conf_mat, axis=0) + + +def calculate_class_recall(conf_mat: np.array) -> np.array: + """ + Calculates the recall for each class from a confusion matrix. + """ + return np.diagonal(conf_mat) / np.sum(conf_mat, axis=1) diff --git a/fltk/nets/util/model.py b/fltk/nets/util/model.py new file mode 100644 index 00000000..9f98c8ec --- /dev/null +++ b/fltk/nets/util/model.py @@ -0,0 +1,116 @@ +import logging +from collections import OrderedDict +from pathlib import Path +from typing import Union + +import torch +from torch.utils.tensorboard import SummaryWriter + +from fltk.util.config.base_config import BareConfig +from fltk.util.results import EpochData + + +def flatten_params(model_description: Union[torch.nn.Module, OrderedDict]): + """ + Flattens all parameters of a module or state_dict into a single column vector. + :param model_description: a torch.nn.Module, or a state_dict (OrderedDict) holding the parameters + :return: a [#params, 1] column tensor containing all parameters, in iteration order + """ + if isinstance(model_description, torch.nn.Module): + parameters = model_description.parameters() + else: + parameters = model_description.values() + l = [torch.flatten(p) for p in parameters] + flat = torch.cat(l).view(-1, 1) + return flat + + +def recover_flattened(flat_params, model): + """ + Gives a list of recovered parameters from their flattened form + :param flat_params: a [#params, 1] tensor, as produced by flatten_params + :param model: the model that gives the params with correct shapes + :return: the params, reshaped to the ones in the model, with the same order as those in the model + """ + indices = [] + s = 0 + for p in model.parameters(): + # Advance by the full element count of the parameter, not just its first dimension. + size = p.numel() + indices.append((s, s + size)) + s += size + l = [flat_params[s:e] for (s, e) in indices] + for i, p in enumerate(model.parameters()): + l[i] = l[i].view(*p.shape) + return l + + +def initialize_default_model(config: BareConfig, model_class) -> torch.nn.Module: + """ + Load a default (pre-initialized) state_dict into a freshly constructed torch model. + @param config: Configuration providing the default model folder path. + @type config: BareConfig + @param model_class: Class of the network to instantiate and initialize. + @type model_class: type + @return: Model initialized from the stored default state_dict. + @rtype: torch.nn.Module + """ + model = model_class() + default_model_path = f"{config.get_default_model_folder_path()}/{model_class.__name__}.model" + model.load_state_dict(torch.load(default_model_path)) + return model + + +def load_model_from_file(model: torch.nn.Module, model_file_path: Path) -> None: + """ + Function to load a PyTorch state_dict model file into a network instance, inplace. This requires the model + file to be of the same type. + + @param model: Instantiated PyTorch module corresponding to the to be loaded network. + @type model: torch.nn.Module + @param model_file_path: Path to a state_dict file generated by PyTorch.
Can be generated for a network by using the + PyTorch torch.save(module.state_dict()) syntax. + @type model_file_path: Path + @return: None + @rtype: None + """ + + if model_file_path.is_file(): + try: + model.load_state_dict(torch.load(model_file_path)) + except Exception: + logging.warning("Couldn't load model. Attempting to map CUDA tensors to CPU to solve error.") + model.load_state_dict(torch.load(model_file_path, map_location=torch.device('cpu'))) + else: + logging.warning("Could not find model: {}".format(model_file_path)) + raise FileNotFoundError(f"Cannot load model file {model_file_path} into {model}...") + + +def save_model(model: torch.nn.Module, directory: str, epoch: int): + """ + Saves the state_dict of the model to disk, tagged with the epoch number. + """ + full_save_path = f"./{directory}/{model.__class__.__name__}_{epoch}.pth" + torch.save(model.state_dict(), full_save_path) + + +def test_model(model, epoch, writer: SummaryWriter = None) -> EpochData: + """ + Function to test a model during training and (optionally) log the accuracy to TensorBoard. + @return: Results of the test round, wrapped as EpochData. + @rtype: EpochData + """ + # Test interleaved to speed up execution, i.e. don't keep the clients waiting. + accuracy, loss, class_precision, class_recall = model.test() + data = EpochData(epoch_id=epoch, + duration_train=0, + duration_test=0, + loss_train=0, + accuracy=accuracy, + loss=loss, + class_precision=class_precision, + class_recall=class_recall, + client_id='federator') + if writer: + writer.add_scalar('accuracy per epoch', accuracy, epoch) + return data diff --git a/fltk/nets/util/reproducability.py b/fltk/nets/util/reproducability.py new file mode 100644 index 00000000..217ef168 --- /dev/null +++ b/fltk/nets/util/reproducability.py @@ -0,0 +1,45 @@ +import os + +import numpy as np +import torch + + +def cuda_reproducible_backend(cuda: bool) -> None: + """ + Function to set the CUDA backend to reproducible (i.e. deterministic) or to default configuration (per PyTorch + 1.9.1). + @param cuda: Parameter to set or unset the reproducibility of the PyTorch CUDA backend. + @type cuda: bool + @return: None + @rtype: None + """ + if cuda: + torch.backends.cudnn.benchmark = False + torch.backends.cudnn.deterministic = True + else: + torch.backends.cudnn.benchmark = True + torch.backends.cudnn.deterministic = False + + +def init_reproducibility(torch_seed: int = 42, cuda: bool = False, numpy_seed: int = 43, hash_seed: int = 44) -> None: + """ + Function to pre-set all seeds for libraries used during training. Allows for reproducible network initialization + and deterministic number generation, and helps prevent 'lucky' draws in network initialization. + @param torch_seed: Integer seed to use for the PyTorch PRNG and CUDA PRNG. + @type torch_seed: int + @param cuda: Flag to indicate whether the CUDA backend needs to be made deterministic as well. + @type cuda: bool + @param numpy_seed: Integer seed to use for NumPy's PRNG.
+ @type numpy_seed: int + @param hash_seed: Integer seed to use for Python's hash function PRNG; this sets the PYTHONHASHSEED environment variable. + @type hash_seed: int + + @return: None + @rtype: None + """ + torch.manual_seed(torch_seed) + if cuda: + torch.cuda.manual_seed_all(torch_seed) + cuda_reproducible_backend(True) + np.random.seed(numpy_seed) + os.environ['PYTHONHASHSEED'] = str(hash_seed) diff --git a/fltk/orchestrator.py b/fltk/orchestrator.py new file mode 100644 index 00000000..174626a1 --- /dev/null +++ b/fltk/orchestrator.py @@ -0,0 +1,132 @@ +import logging +import time +import uuid +from queue import PriorityQueue +from typing import List + +from kubeflow.pytorchjob import PyTorchJobClient +from kubeflow.pytorchjob.constants.constants import PYTORCHJOB_GROUP, PYTORCHJOB_VERSION, PYTORCHJOB_PLURAL +from kubernetes import client + +from fltk.util.cluster.client import construct_job, ClusterManager +from fltk.util.config.base_config import BareConfig +from fltk.util.task.generator.arrival_generator import ArrivalGenerator, Arrival +from fltk.util.task.task import ArrivalTask + + +class Orchestrator(object): + """ + Central component of the Federated Learning System: The Orchestrator + + The Orchestrator is in charge of the following tasks: + - Running experiments + - Creating and/or managing tasks + - Keep track of progress (pending/started/failed/completed) + - Keep track of timing + + Note that the Orchestrator does not function like a Federator: it does not keep a central model, perform + aggregations, or keep track of Clients. Instead, the KubeFlow PyTorch-Operator is used to deploy a train task as + a V1PyTorchJob, which automatically generates the required setup in the cluster. In addition, this allows more Jobs + to be scheduled than there are resources for, letting the Kubernetes Scheduler decide when and where to run + which containers. + """ + _alive = False + # Priority queue, requires an orderable object, otherwise a Tuple[int, Any] can be used to insert. + pending_tasks: "PriorityQueue[ArrivalTask]" = PriorityQueue() + deployed_tasks: List[ArrivalTask] = [] + completed_tasks: List[str] = [] + + def __init__(self, cluster_mgr: ClusterManager, arv_gen: ArrivalGenerator, config: BareConfig): + self.__logger = logging.getLogger('Orchestrator') + self.__logger.debug("Loading in-cluster configuration") + self.__cluster_mgr = cluster_mgr + self.__arrival_generator = arv_gen + self._config = config + + # API to interact with the cluster. + self.__client = PyTorchJobClient() + + def stop(self) -> None: + """ + Stop the Orchestrator. + @return: None + @rtype: None + """ + self.__logger.info("Received stop signal for the Orchestrator.") + self._alive = False + + def run(self, clear: bool = True) -> None: + """ + Main loop of the Orchestrator. + @param clear: Boolean indicating whether a previous deployment needs to be cleaned up (i.e. lingering jobs that + were deployed by the previous run). + + @type clear: bool + @return: None + @rtype: None + """ + self._alive = True + start_time = time.time() + if clear: + self.__clear_jobs() + while self._alive and time.time() - start_time < self._config.get_duration(): + # 1.
Check arrivals + # If new arrivals, store them in arrival list + while not self.__arrival_generator.arrivals.empty(): + arrival: Arrival = self.__arrival_generator.arrivals.get() + unique_identifier: uuid.UUID = uuid.uuid4() + task = ArrivalTask(priority=arrival.get_priority(), + id=unique_identifier, + network=arrival.get_network(), + dataset=arrival.get_dataset(), + sys_conf=arrival.get_system_config(), + param_conf=arrival.get_parameter_config()) + + self.__logger.debug(f"Arrival of: {task}") + self.pending_tasks.put(task) + + while not self.pending_tasks.empty(): + # Do blocking request to priority queue + curr_task = self.pending_tasks.get() + self.__logger.info(f"Scheduling arrival of Arrival: {curr_task.id}") + job_to_start = construct_job(self._config, curr_task) + + + # Hack to overcome limitation of KubeFlow version (Made for older version of Kubernetes) + self.__logger.info(f"Deploying on cluster: {curr_task.id}") + self.__client.create(job_to_start, namespace=self._config.cluster_config.namespace) + self.deployed_tasks.append(curr_task) + + # TODO: Extend this logic in your real project, this is only meant for demo purposes + # For now we exit the thread after scheduling a single task. + + self.stop() + return + + self.__logger.debug("Still alive...") + time.sleep(5) + + logging.info(f'Experiment completed, currently does not support waiting.') + + def __clear_jobs(self): + """ + Function to clear existing jobs in the environment (i.e. old experiments/tests) + @return: None + @rtype: None + """ + namespace = self._config.cluster_config.namespace + self.__logger.info(f'Clearing old jobs in current namespace: {namespace}') + + for job in self.__client.get(namespace=self._config.cluster_config.namespace)['items']: + job_name = job['metadata']['name'] + self.__logger.info(f'Deleting: {job_name}') + try: + self.__client.custom_api.delete_namespaced_custom_object( + PYTORCHJOB_GROUP, + PYTORCHJOB_VERSION, + namespace, + PYTORCHJOB_PLURAL, + job_name) + except Exception as e: + self.__logger.warning(f'Could not delete: {job_name}') + print(e) diff --git a/fltk/schedulers/__init__.py b/fltk/schedulers/__init__.py index d5bf41f5..b1039496 100644 --- a/fltk/schedulers/__init__.py +++ b/fltk/schedulers/__init__.py @@ -1 +1 @@ -from .min_lr_step import MinCapableStepLR +from .min_lr_step import MinCapableStepLR, LearningScheduler diff --git a/fltk/schedulers/min_lr_step.py b/fltk/schedulers/min_lr_step.py index 6a44fdcb..0cea18ae 100644 --- a/fltk/schedulers/min_lr_step.py +++ b/fltk/schedulers/min_lr_step.py @@ -1,9 +1,22 @@ -class MinCapableStepLR: +import abc +import logging - def __init__(self, logger, optimizer, step_size, gamma, min_lr): +import torch + + +class LearningScheduler(abc.ABC): + + @abc.abstractmethod + def step(self): + raise NotImplementedError() + + +class MinCapableStepLR(LearningScheduler): + + def __init__(self, optimizer: torch.optim.Optimizer, step_size, gamma, min_lr): """ :param logger: logger - :type logger: loguru.logger + :type logger: logger :param optimizer: :type optimizer: torch.optim :param step_size: # of epochs between LR updates @@ -13,7 +26,7 @@ def __init__(self, logger, optimizer, step_size, gamma, min_lr): :param min_lr: minimum learning rate :type min_lr: float """ - self.logger = logger + self.logger = logging.getLogger('MinCapableStepLR') self.optimizer = optimizer self.step_size = step_size diff --git a/fltk/strategy/aggregation.py b/fltk/strategy/aggregation.py index e062d78c..f18ac1aa 100644 --- a/fltk/strategy/aggregation.py +++ 
b/fltk/strategy/aggregation.py @@ -1,11 +1,8 @@ - - - def average_nn_parameters(parameters): """ - Averages passed parameters. - :param parameters: nn model named parameters - :type parameters: list + @deprecated Average passed parameters. + @param parameters: nn model named parameters + @type parameters: list """ new_params = {} for name in parameters[0].keys(): @@ -13,18 +10,26 @@ def average_nn_parameters(parameters): return new_params + def fed_average_nn_parameters(parameters, sizes): + """ + @deprecated Federated Average passed parameters. + @param parameters: nn model named parameters + @type parameters: list + @param sizes: + @type sizes: + """ new_params = {} sum_size = 0 for client in parameters: for name in parameters[client].keys(): try: new_params[name].data += (parameters[client][name].data * sizes[client]) - except: + except Exception as e: new_params[name] = (parameters[client][name].data * sizes[client]) sum_size += sizes[client] for name in new_params: new_params[name].data /= sum_size - return new_params \ No newline at end of file + return new_params diff --git a/fltk/strategy/data_samplers.py b/fltk/strategy/data_samplers.py index 98eba58e..e452e14f 100644 --- a/fltk/strategy/data_samplers.py +++ b/fltk/strategy/data_samplers.py @@ -1,16 +1,18 @@ -from torchvision import datasets, transforms -import random import logging -from torch.utils.data import DistributedSampler, Dataset -from typing import Iterator +import random from collections import Counter +from typing import Iterator + import numpy as np +from torch.utils.data import DistributedSampler, Dataset + class DistributedSamplerWrapper(DistributedSampler): indices = [] epoch_size = 1.0 - def __init__(self, dataset: Dataset, num_replicas = None, - rank = None, seed = 0) -> None: + + def __init__(self, dataset: Dataset, num_replicas=None, + rank=None, seed=0) -> None: super().__init__(dataset, num_replicas=num_replicas, rank=rank) self.client_id = rank - 1 @@ -18,7 +20,6 @@ def __init__(self, dataset: Dataset, num_replicas = None, self.n_labels = len(dataset.classes) self.seed = seed - def order_by_label(self, dataset): # order the indices by label ordered_by_label = [[] for i in range(len(dataset.classes))] @@ -38,20 +39,20 @@ def set_epoch_size(self, epoch_size: float) -> None: self.epoch_size = epoch_size def __iter__(self) -> Iterator[int]: - random.seed(self.rank+self.epoch) + random.seed(self.rank + self.epoch) epochs_todo = self.epoch_size indices = [] - while(epochs_todo > 0.0): + while (epochs_todo > 0.0): random.shuffle(self.indices) if epochs_todo >= 1.0: indices.extend(self.indices) else: - end_index = int(round(len(self.indices)*epochs_todo)) + end_index = int(round(len(self.indices) * epochs_todo)) indices.extend(self.indices[:end_index]) epochs_todo = epochs_todo - 1 - ratio = len(indices)/float(len(self.indices)) + ratio = len(indices) / float(len(self.indices)) np.testing.assert_almost_equal(ratio, self.epoch_size, decimal=2) return iter(indices) @@ -59,14 +60,16 @@ def __iter__(self) -> Iterator[int]: def __len__(self) -> int: return len(self.indices) + class LimitLabelsSampler(DistributedSamplerWrapper): """ A sampler that limits the number of labels per client """ + def __init__(self, dataset, num_replicas, rank, args=(5, 42)): limit, seed = args super().__init__(dataset, num_replicas, rank, seed) - + if self.n_clients % self.n_labels != 0: logging.error( "multiples of {} clients are needed for the 'limiting-labels' data distribution method, {} does not work".format( @@ -143,6 +146,7 @@ def 
__init__(self, dataset, num_replicas, rank, args=(5, 42)): self.indices = indices + class Probability_q_Sampler(DistributedSamplerWrapper): """ Clients are divided among M groups, with M being the number of labels. @@ -155,11 +159,11 @@ class Probability_q_Sampler(DistributedSamplerWrapper): def __init__(self, dataset, num_replicas, rank, args=(0.5, 42)): q, seed = args super().__init__(dataset, num_replicas, rank, seed) - + if self.n_clients % self.n_labels != 0: logging.error( "multiples of {} clients are needed for the 'probability-q-sampler' data distribution method, {} does not work".format( - self.n_labels,self.n_clients)) + self.n_labels, self.n_clients)) return # divide data among groups @@ -193,13 +197,15 @@ def __init__(self, dataset, num_replicas, rank, args=(0.5, 42)): self.indices = indices + class DirichletSampler(DistributedSamplerWrapper): """ Generates a (non-iid) data distribution by sampling the dirichlet distribution. Dirichlet constructs a vector of length num_clients, that sums to one. Decreasing alpha results in a more non-iid data set. This distribution method results in both label and quantity skew. """ - def __init__(self, dataset: Dataset, num_replicas = None, - rank = None, args = (0.5, 42)) -> None: + + def __init__(self, dataset: Dataset, num_replicas=None, + rank=None, args=(0.5, 42)) -> None: alpha, seed = args super().__init__(dataset, num_replicas=num_replicas, rank=rank, seed=seed) @@ -209,7 +215,7 @@ def __init__(self, dataset: Dataset, num_replicas = None, for labels in ordered_by_label: n_samples = len(labels) # generate an allocation by sampling dirichlet, which results in how many samples each client gets - allocation = np.random.dirichlet([alpha] * self.n_clients) * n_samples + allocation = np.random.dirichlet([alpha] * self.n_clients) * n_samples allocation = allocation.astype(int) start_index = allocation[0:self.client_id].sum() end_index = 0 @@ -230,19 +236,21 @@ def __init__(self, dataset: Dataset, num_replicas = None, self.indices = indices + class UniformSampler(DistributedSamplerWrapper): def __init__(self, dataset, num_replicas=None, rank=None, seed=0): super().__init__(dataset, num_replicas=num_replicas, rank=rank, seed=seed) indices = list(range(len(self.dataset))) self.indices = indices[self.rank:self.total_size:self.num_replicas] + def get_sampler(dataset, args): sampler = None if args.get_distributed(): method = args.get_sampler() args.get_logger().info( "Using {} sampler method, with args: {}".format(method, args.get_sampler_args())) - + if method == "uniform": sampler = UniformSampler(dataset, num_replicas=args.get_world_size(), rank=args.get_rank()) elif method == "q sampler": diff --git a/fltk/util/arguments.py b/fltk/util/arguments.py deleted file mode 100644 index 7945113d..00000000 --- a/fltk/util/arguments.py +++ /dev/null @@ -1,293 +0,0 @@ -import torch.nn.functional as F - -import torch -import json - -# Setting the seed for Torch -import yaml - -from fltk.nets import Cifar10CNN, FashionMNISTCNN, Cifar100ResNet, FashionMNISTResNet, Cifar10ResNet, Cifar100VGG - -SEED = 1 -torch.manual_seed(SEED) - -class Arguments: - - def __init__(self, logger): - self.logger = logger - - self.batch_size = 10 - self.test_batch_size = 1000 - self.epochs = 1 - self.lr = 0.001 - self.momentum = 0.9 - self.cuda = False - self.shuffle = False - self.log_interval = 10 - self.kwargs = {} - self.contribution_measurement_round = 1 - self.contribution_measurement_metric = 'Influence' - - self.scheduler_step_size = 50 - self.scheduler_gamma = 0.5 - 
self.min_lr = 1e-10 - - self.round_worker_selection_strategy = None - self.round_worker_selection_strategy_kwargs = None - - self.save_model = False - self.save_temp_model = False - self.save_epoch_interval = 1 - self.save_model_path = "models" - self.epoch_save_start_suffix = "start" - self.epoch_save_end_suffix = "end" - self.get_poison_effort = 'half' - self.num_workers = 50 - # self.num_poisoned_workers = 10 - - self.rank = 0 - self.world_size = 0 - self.data_sampler = None - self.distributed = False - self.available_nets = { - "Cifar100ResNet" : Cifar100ResNet, - "Cifar100VGG" : Cifar100VGG, - "Cifar10CNN" : Cifar10CNN, - "Cifar10ResNet" : Cifar10ResNet, - "FashionMNISTCNN" : FashionMNISTCNN, - "FashionMNISTResNet" : FashionMNISTResNet - - } - self.net = None - self.set_net_by_name('Cifar10CNN') - # self.net = FashionMNISTCNN - # self.net = Cifar100ResNet - # self.net = FashionMNISTResNet - # self.net = Cifar10ResNet - # self.net = Cifar10ResNet - self.dataset_name = 'cifar10' - self.train_data_loader_pickle_path = { - 'cifar10': 'data_loaders/cifar10/train_data_loader.pickle', - 'fashion-mnist': 'data_loaders/fashion-mnist/train_data_loader.pickle', - 'cifar100': 'data_loaders/cifar100/train_data_loader.pickle', - } - - self.test_data_loader_pickle_path = { - 'cifar10': 'data_loaders/cifar10/test_data_loader.pickle', - 'fashion-mnist': 'data_loaders/fashion-mnist/test_data_loader.pickle', - 'cifar100': 'data_loaders/cifar100/test_data_loader.pickle', - } - - # self.train_data_loader_pickle_path = "data_loaders/cifar10/train_data_loader.pickle" - # self.test_data_loader_pickle_path = "data_loaders/cifar10/test_data_loader.pickle" - - # self.train_data_loader_pickle_path = "data_loaders/fashion-mnist/train_data_loader.pickle" - # self.test_data_loader_pickle_path = "data_loaders/fashion-mnist/test_data_loader.pickle" - - # self.train_data_loader_pickle_path = "data_loaders/cifar100/train_data_loader.pickle" - # self.test_data_loader_pickle_path = "data_loaders/cifar100/test_data_loader.pickle" - - self.loss_function = torch.nn.CrossEntropyLoss - - self.default_model_folder_path = "default_models" - - self.data_path = "data" - - def get_distributed(self): - return self.distributed - - def get_rank(self): - return self.rank - - def get_world_size(self): - return self.world_size - - def set_sampler(self, sampler): - self.data_sampler = sampler - - def get_sampler(self): - return self.data_sampler - - def get_round_worker_selection_strategy(self): - return self.round_worker_selection_strategy - - def get_round_worker_selection_strategy_kwargs(self): - return self.round_worker_selection_strategy_kwargs - - def set_round_worker_selection_strategy_kwargs(self, kwargs): - self.round_worker_selection_strategy_kwargs = kwargs - - def set_client_selection_strategy(self, strategy): - self.round_worker_selection_strategy = strategy - - def get_data_path(self): - return self.data_path - - def get_epoch_save_start_suffix(self): - return self.epoch_save_start_suffix - - def get_epoch_save_end_suffix(self): - return self.epoch_save_end_suffix - - def get_dataloader_list(self): - return list(self.train_data_loader_pickle_path.keys()) - - def get_nets_list(self): - return list(self.available_nets.keys()) - - - def set_train_data_loader_pickle_path(self, path, name='cifar10'): - self.train_data_loader_pickle_path[name] = path - - def get_train_data_loader_pickle_path(self): - return self.train_data_loader_pickle_path[self.dataset_name] - - def set_test_data_loader_pickle_path(self, path, name='cifar10'): 
- self.test_data_loader_pickle_path[name] = path - - def get_test_data_loader_pickle_path(self): - return self.test_data_loader_pickle_path[self.dataset_name] - - def set_net_by_name(self, name: str): - self.net = self.available_nets[name] - # net_dict = { - # 'cifar10-cnn': Cifar10CNN, - # 'fashion-mnist-cnn': FashionMNISTCNN, - # 'cifar100-resnet': Cifar100ResNet, - # 'fashion-mnist-resnet': FashionMNISTResNet, - # 'cifar10-resnet': Cifar10ResNet, - # 'cifar100-vgg': Cifar100VGG, - # } - # self.net = net_dict[name] - - def get_cuda(self): - return self.cuda - - def get_scheduler_step_size(self): - return self.scheduler_step_size - - def get_scheduler_gamma(self): - return self.scheduler_gamma - - def get_min_lr(self): - return self.min_lr - - def get_default_model_folder_path(self): - return self.default_model_folder_path - - def get_num_epochs(self): - return self.epochs - - def set_num_poisoned_workers(self, num_poisoned_workers): - self.num_poisoned_workers = num_poisoned_workers - - def set_num_workers(self, num_workers): - self.num_workers = num_workers - - def set_model_save_path(self, save_model_path): - self.save_model_path = save_model_path - - def get_logger(self): - return self.logger - - def get_loss_function(self): - return self.loss_function - - def get_net(self): - return self.net - - def get_num_workers(self): - return self.num_workers - - def get_num_poisoned_workers(self): - return self.num_poisoned_workers - - def get_poison_effort(self): - return self.get_poison_effort - - def get_learning_rate(self): - return self.lr - - def get_momentum(self): - return self.momentum - - def get_shuffle(self): - return self.shuffle - - def get_batch_size(self): - return self.batch_size - - def get_test_batch_size(self): - return self.test_batch_size - - def get_log_interval(self): - return self.log_interval - - def get_save_model_folder_path(self): - return self.save_model_path - - def get_learning_rate_from_epoch(self, epoch_idx): - lr = self.lr * (self.scheduler_gamma ** int(epoch_idx / self.scheduler_step_size)) - - if lr < self.min_lr: - self.logger.warning("Updating LR would place it below min LR. Skipping LR update.") - - return self.min_lr - - self.logger.debug("LR: {}".format(lr)) - - return lr - - def get_contribution_measurement_round(self): - return self.contribution_measurement_round - - def get_contribution_measurement_metric(self): - return self.contribution_measurement_metric - - def should_save_model(self, epoch_idx): - """ - Returns true/false models should be saved. - - :param epoch_idx: current training epoch index - :type epoch_idx: int - """ - if not self.save_model: - return False - - if epoch_idx == 1 or epoch_idx % self.save_epoch_interval == 0: - return True - - def log(self): - """ - Log this arguments object to the logger. 
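The learning-rate logic in `get_learning_rate_from_epoch` above is a plain step decay with a lower bound. A minimal sketch of the same rule, using this class's default values (the helper name is ours, not FLTK's):

```python
# Step-decay schedule as in get_learning_rate_from_epoch; defaults taken
# from this class (lr=0.001, gamma=0.5, step_size=50, min_lr=1e-10).
def lr_from_epoch(epoch_idx, lr=0.001, gamma=0.5, step_size=50, min_lr=1e-10):
    decayed = lr * (gamma ** (epoch_idx // step_size))
    return max(decayed, min_lr)  # never decay below the configured minimum

assert lr_from_epoch(0) == 0.001     # first cycle: no decay yet
assert lr_from_epoch(50) == 0.0005   # halved after one step_size
assert lr_from_epoch(100) == 0.00025
```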
- """ - self.logger.debug("Arguments: {}", str(self)) - - def __str__(self): - return "\nBatch Size: {}\n".format(self.batch_size) + \ - "Test Batch Size: {}\n".format(self.test_batch_size) + \ - "Epochs: {}\n".format(self.epochs) + \ - "Learning Rate: {}\n".format(self.lr) + \ - "Momentum: {}\n".format(self.momentum) + \ - "CUDA Enabled: {}\n".format(self.cuda) + \ - "Shuffle Enabled: {}\n".format(self.shuffle) + \ - "Log Interval: {}\n".format(self.log_interval) + \ - "Scheduler Step Size: {}\n".format(self.scheduler_step_size) + \ - "Scheduler Gamma: {}\n".format(self.scheduler_gamma) + \ - "Scheduler Minimum Learning Rate: {}\n".format(self.min_lr) + \ - "Client Selection Strategy: {}\n".format(self.round_worker_selection_strategy) + \ - "Client Selection Strategy Arguments: {}\n".format(json.dumps(self.round_worker_selection_strategy_kwargs, indent=4, sort_keys=True)) + \ - "Model Saving Enabled: {}\n".format(self.save_model) + \ - "Model Saving Interval: {}\n".format(self.save_epoch_interval) + \ - "Model Saving Path (Relative): {}\n".format(self.save_model_path) + \ - "Epoch Save Start Prefix: {}\n".format(self.epoch_save_start_suffix) + \ - "Epoch Save End Suffix: {}\n".format(self.epoch_save_end_suffix) + \ - "Number of Clients: {}\n".format(self.num_workers) + \ - "Number of Poisoned Clients: {}\n".format(self.num_poisoned_workers) + \ - "NN: {}\n".format(self.net) + \ - "Train Data Loader Path: {}\n".format(self.train_data_loader_pickle_path) + \ - "Test Data Loader Path: {}\n".format(self.test_data_loader_pickle_path) + \ - "Loss Function: {}\n".format(self.loss_function) + \ - "Default Model Folder Path: {}\n".format(self.default_model_folder_path) + \ - "Data Path: {}\n".format(self.data_path) + \ - "Dataset Name: {}\n".format(self.dataset_name) \ No newline at end of file diff --git a/fltk/util/base_config.py b/fltk/util/base_config.py deleted file mode 100644 index c814965f..00000000 --- a/fltk/util/base_config.py +++ /dev/null @@ -1,333 +0,0 @@ -import torch -import json - -from fltk.datasets.distributed import DistCIFAR10Dataset, DistCIFAR100Dataset, DistFashionMNISTDataset -from fltk.nets import Cifar10CNN, FashionMNISTCNN, Cifar100ResNet, FashionMNISTResNet, Cifar10ResNet, Cifar100VGG - -SEED = 1 -torch.manual_seed(SEED) - -class BareConfig: - - def __init__(self): - # self.logger = logger - - self.batch_size = 10 - self.test_batch_size = 1000 - self.epochs = 1 - self.lr = 0.001 - self.momentum = 0.9 - self.cuda = False - self.shuffle = False - self.log_interval = 10 - self.kwargs = {} - self.contribution_measurement_round = 1 - self.contribution_measurement_metric = 'Influence' - - self.scheduler_step_size = 50 - self.scheduler_gamma = 0.5 - self.min_lr = 1e-10 - - self.round_worker_selection_strategy = None - self.round_worker_selection_strategy_kwargs = None - - self.save_model = False - self.save_temp_model = False - self.save_epoch_interval = 1 - self.save_model_path = "models" - self.epoch_save_start_suffix = "start" - self.epoch_save_end_suffix = "end" - self.get_poison_effort = 'half' - self.num_workers = 50 - # self.num_poisoned_workers = 10 - - self.federator_host = '0.0.0.0' - self.rank = 0 - self.world_size = 0 - self.data_sampler = "uniform" - self.data_sampler_args = None - self.distributed = False - self.available_nets = { - "Cifar100ResNet": Cifar100ResNet, - "Cifar100VGG": Cifar100VGG, - "Cifar10CNN": Cifar10CNN, - "Cifar10ResNet": Cifar10ResNet, - "FashionMNISTCNN": FashionMNISTCNN, - "FashionMNISTResNet": FashionMNISTResNet - - } - self.net = None - 
self.set_net_by_name('Cifar10CNN') - self.dataset_name = 'cifar10' - - self.DistDatasets = { - 'cifar10': DistCIFAR10Dataset, - 'cifar100': DistCIFAR100Dataset, - 'fashion-mnist': DistFashionMNISTDataset, - } - self.train_data_loader_pickle_path = { - 'cifar10': 'data_loaders/cifar10/train_data_loader.pickle', - 'fashion-mnist': 'data_loaders/fashion-mnist/train_data_loader.pickle', - 'cifar100': 'data_loaders/cifar100/train_data_loader.pickle', - } - - self.test_data_loader_pickle_path = { - 'cifar10': 'data_loaders/cifar10/test_data_loader.pickle', - 'fashion-mnist': 'data_loaders/fashion-mnist/test_data_loader.pickle', - 'cifar100': 'data_loaders/cifar100/test_data_loader.pickle', - } - self.loss_function = torch.nn.CrossEntropyLoss - self.default_model_folder_path = "default_models" - self.data_path = "data" - - ########### - # Methods # - ########### - - def merge_yaml(self, cfg = {}): - """ - total_epochs: 20 - epochs_per_cycle: 2 - wait_for_clients: true - net: Cifar10CNN - dataset: cifar10 - experiment_prefix: 'experiment' - output_location: 'output' - tensor_board_active: true - :param yaml_config: - :return: - """ - if 'total_epochs' in cfg: - self.epochs = cfg['total_epochs'] - if 'epochs_per_cycle' in cfg: - self.epochs_per_cycle = cfg['epochs_per_cycle'] - if 'wait_for_clients' in cfg: - self.wait_for_clients = cfg['wait_for_clients'] - if 'net' in cfg: - self.set_net_by_name(cfg['net']) - if 'dataset' in cfg: - self.dataset_name = cfg['dataset'] - if 'experiment_prefix' in cfg: - self.experiment_prefix = cfg['experiment_prefix'] - if 'output_location' in cfg: - self.output_location = cfg['output_location'] - if 'tensor_board_active' in cfg: - self.tensor_board_active = cfg['tensor_board_active'] - if 'clients_per_round' in cfg: - self.clients_per_round = cfg['clients_per_round'] - if 'system' in cfg: - if 'clients' in cfg['system']: - if 'amount' in cfg['system']['clients']: - self.world_size = cfg['system']['clients']['amount'] + 1 - - if 'system' in cfg: - if 'federator' in cfg['system']: - if 'hostname' in cfg['system']['federator']: - self.federator_host = cfg['system']['federator']['hostname'] - if 'cuda' in cfg: - if cfg['cuda']: - self.cuda = True - else: - self.cuda = False - if 'sampler' in cfg: - self.data_sampler = cfg['sampler'] - if 'sampler_args' in cfg: - self.data_sampler_args = cfg['sampler_args'] - - - - def init_logger(self, logger): - self.logger = logger - - def get_distributed(self): - return self.distributed - - def get_rank(self): - return self.rank - - def get_world_size(self): - return self.world_size - - def set_sampler(self, sampler): - self.data_sampler = sampler - - def get_sampler(self): - return self.data_sampler - - def get_sampler_args(self): - return tuple(self.data_sampler_args) - - def get_round_worker_selection_strategy(self): - return self.round_worker_selection_strategy - - def get_round_worker_selection_strategy_kwargs(self): - return self.round_worker_selection_strategy_kwargs - - def set_round_worker_selection_strategy_kwargs(self, kwargs): - self.round_worker_selection_strategy_kwargs = kwargs - - def set_client_selection_strategy(self, strategy): - self.round_worker_selection_strategy = strategy - - def get_data_path(self): - return self.data_path - - def get_epoch_save_start_suffix(self): - return self.epoch_save_start_suffix - - def get_epoch_save_end_suffix(self): - return self.epoch_save_end_suffix - - def get_dataloader_list(self): - return list(self.train_data_loader_pickle_path.keys()) - - def get_nets_list(self): - return 
list(self.available_nets.keys()) - - def set_train_data_loader_pickle_path(self, path, name='cifar10'): - self.train_data_loader_pickle_path[name] = path - - def get_train_data_loader_pickle_path(self): - return self.train_data_loader_pickle_path[self.dataset_name] - - def set_test_data_loader_pickle_path(self, path, name='cifar10'): - self.test_data_loader_pickle_path[name] = path - - def get_test_data_loader_pickle_path(self): - return self.test_data_loader_pickle_path[self.dataset_name] - - def set_net_by_name(self, name: str): - self.net = self.available_nets[name] - - def get_cuda(self): - return self.cuda - - def get_scheduler_step_size(self): - return self.scheduler_step_size - - def get_scheduler_gamma(self): - return self.scheduler_gamma - - def get_min_lr(self): - return self.min_lr - - def get_default_model_folder_path(self): - return self.default_model_folder_path - - def get_num_epochs(self): - return self.epochs - - def set_num_poisoned_workers(self, num_poisoned_workers): - self.num_poisoned_workers = num_poisoned_workers - - def set_num_workers(self, num_workers): - self.num_workers = num_workers - - def set_model_save_path(self, save_model_path): - self.save_model_path = save_model_path - - def get_logger(self): - return self.logger - - def get_loss_function(self): - return self.loss_function - - def get_net(self): - return self.net - - def get_num_workers(self): - return self.num_workers - - def get_num_poisoned_workers(self): - return self.num_poisoned_workers - - def get_poison_effort(self): - return self.get_poison_effort - - def get_learning_rate(self): - return self.lr - - def get_momentum(self): - return self.momentum - - def get_shuffle(self): - return self.shuffle - - def get_batch_size(self): - return self.batch_size - - def get_test_batch_size(self): - return self.test_batch_size - - def get_log_interval(self): - return self.log_interval - - def get_save_model_folder_path(self): - return self.save_model_path - - def get_learning_rate_from_epoch(self, epoch_idx): - lr = self.lr * (self.scheduler_gamma ** int(epoch_idx / self.scheduler_step_size)) - - if lr < self.min_lr: - self.logger.warning("Updating LR would place it below min LR. Skipping LR update.") - - return self.min_lr - - self.logger.debug("LR: {}".format(lr)) - - return lr - - def get_contribution_measurement_round(self): - return self.contribution_measurement_round - - def get_contribution_measurement_metric(self): - return self.contribution_measurement_metric - - def should_save_model(self, epoch_idx): - """ - Returns true/false models should be saved. - - :param epoch_idx: current training epoch index - :type epoch_idx: int - """ - if not self.save_model: - return False - - if epoch_idx == 1 or epoch_idx % self.save_epoch_interval == 0: - return True - - def log(self): - """ - Log this arguments object to the logger. 
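For illustration, this is how the (now removed) `BareConfig.merge_yaml` was driven before this change; the YAML snippet below is hypothetical, but its keys are the ones the method handles:

```python
import yaml

# Hypothetical experiment snippet; keys mirror those consumed by merge_yaml.
yaml_text = """
total_epochs: 20
net: Cifar10CNN
dataset: cifar10
system:
  clients:
    amount: 4
"""

cfg = BareConfig()  # the pre-change class defined in this module
cfg.merge_yaml(yaml.safe_load(yaml_text))
assert cfg.get_world_size() == 5  # clients + 1 federator
```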
- """ - self.logger.debug("Arguments: {}", str(self)) - - def __str__(self): - return "\nBatch Size: {}\n".format(self.batch_size) + \ - "Test Batch Size: {}\n".format(self.test_batch_size) + \ - "Epochs: {}\n".format(self.epochs) + \ - "Learning Rate: {}\n".format(self.lr) + \ - "Momentum: {}\n".format(self.momentum) + \ - "CUDA Enabled: {}\n".format(self.cuda) + \ - "Shuffle Enabled: {}\n".format(self.shuffle) + \ - "Log Interval: {}\n".format(self.log_interval) + \ - "Scheduler Step Size: {}\n".format(self.scheduler_step_size) + \ - "Scheduler Gamma: {}\n".format(self.scheduler_gamma) + \ - "Scheduler Minimum Learning Rate: {}\n".format(self.min_lr) + \ - "Client Selection Strategy: {}\n".format(self.round_worker_selection_strategy) + \ - "Client Selection Strategy Arguments: {}\n".format( - json.dumps(self.round_worker_selection_strategy_kwargs, indent=4, sort_keys=True)) + \ - "Model Saving Enabled: {}\n".format(self.save_model) + \ - "Model Saving Interval: {}\n".format(self.save_epoch_interval) + \ - "Model Saving Path (Relative): {}\n".format(self.save_model_path) + \ - "Epoch Save Start Prefix: {}\n".format(self.epoch_save_start_suffix) + \ - "Epoch Save End Suffix: {}\n".format(self.epoch_save_end_suffix) + \ - "Number of Clients: {}\n".format(self.num_workers) + \ - "Number of Poisoned Clients: {}\n".format(self.num_poisoned_workers) + \ - "NN: {}\n".format(self.net) + \ - "Train Data Loader Path: {}\n".format(self.train_data_loader_pickle_path) + \ - "Test Data Loader Path: {}\n".format(self.test_data_loader_pickle_path) + \ - "Loss Function: {}\n".format(self.loss_function) + \ - "Default Model Folder Path: {}\n".format(self.default_model_folder_path) + \ - "Data Path: {}\n".format(self.data_path) + \ - "Dataset Name: {}\n".format(self.dataset_name) \ No newline at end of file diff --git a/fltk/util/cluster/__init__.py b/fltk/util/cluster/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/fltk/util/cluster/client.py b/fltk/util/cluster/client.py new file mode 100644 index 00000000..ce2bf0ce --- /dev/null +++ b/fltk/util/cluster/client.py @@ -0,0 +1,365 @@ +import logging +import time +from collections import defaultdict +from dataclasses import dataclass +from multiprocessing.pool import ThreadPool +from typing import Dict, List, Tuple, Optional +from uuid import UUID + +import schedule +from kubeflow.pytorchjob import V1PyTorchJob, V1ReplicaSpec, V1PyTorchJobSpec +from kubernetes import client +from kubernetes.client import V1ObjectMeta, V1ResourceRequirements, V1Container, V1PodTemplateSpec, \ + V1VolumeMount, V1Toleration, V1Volume, V1PersistentVolumeClaimVolumeSource + +from fltk.util.cluster.conversion import Convert +from fltk.util.config import BareConfig +from fltk.util.singleton import Singleton +from fltk.util.task.task import ArrivalTask + + +@dataclass +class Resource: + node_name: str + cpu_allocatable: int + memory_allocatable: int + cpu_requested: int + memory_requested: int + cpu_limit: int + memory_limit: int + + +class BuildDescription: + resources: V1ResourceRequirements + master_container: V1Container + worker_container: V1Container + master_template: V1PodTemplateSpec + worker_template: V1PodTemplateSpec + id: UUID + spec: V1PyTorchJobSpec + tolerations: List[V1Toleration] + + +class ResourceWatchDog: + """ + Class to be used to monitor the resources available within the cluster. For this the resource API is not needed, but + can be used to extend/speedup/prettify the implementation. 
The implementation is based on the work by @gorenje found
+    on GitHub:
+
+    https://gist.github.com/gorenje/dff508489c3c8a460433ad709f14b7db
+    """
+    _alive: bool = False
+    _time: float = -1
+    _node_lookup: Dict[str, client.V1Node] = dict()
+    _resource_lookup: Dict[str, Resource] = dict()
+
+    def __init__(self):
+        """
+        Work should be based on the details listed here:
+        https://github.com/scylladb/scylla-cluster-tests/blob/a7b09e69f0152a4d70bfb25ded3d75b7e7328acc/sdcm/cluster_k8s/__init__.py#L216-L223
+        """
+        self._v1: client.CoreV1Api
+        self._logger = logging.getLogger('ResourceWatchDog')
+        self._Q = Convert()
+
+    def stop(self) -> None:
+        """
+        Function to stop execution. The runner thread _should_ merge back into the thread pool after this
+        function has been called.
+        @return: None
+        @rtype: None
+        """
+        self._logger.info("[WatchDog] Received request to stop execution")
+        self._alive = False
+
+    def start(self) -> None:
+        """
+        Function to start the resource watchdog. Currently, it only monitors the per-node memory and cpu
+        availability. This does not handle event scheduling.
+        @return: None
+        @rtype: None
+        """
+        self._logger.info("Starting resource watchdog")
+        self._alive = True
+        self._v1 = client.CoreV1Api()
+        self.__monitor_nodes()
+
+        # Every 10 seconds we check the pods running on the known nodes.
+        schedule.every(10).seconds.do(self.__monitor_pods).tag('pod-monitoring')
+        # Every minute we re-fetch the nodes (in case the cluster topology changes).
+        schedule.every(1).minutes.do(self.__monitor_nodes).tag('node-monitoring')
+
+        self._logger.info("Starting with watching resources")
+        while self._alive:
+            schedule.run_pending()
+            time.sleep(1)
+
+    def __monitor_nodes(self) -> None:
+        """
+        Watchdog function that watches the Cluster resources in a K8s cluster. Requires the conf to be set and loaded
+        prior to calling.
+        @return: None
+        @rtype: None
+        """
+        self._logger.info("Fetching node information of cluster...")
+        try:
+            node_list: client.V1NodeList = self._v1.list_node(watch=False)
+            self._node_lookup = {node.metadata.name: node for node in node_list.items}
+            if not self._alive:
+                self._logger.info("Instructed to stop, stopping list_node watch on Kubernetes.")
+                return
+        except Exception as e:
+            self._logger.error(e)
+            raise e
+
+    def __monitor_pods(self) -> None:
+        """
+        Function to monitor pod activity of currently listed pods. The available pods themselves are to be fetched
+        prior to calling this function. Stale pod information will result in an incomplete update, as pods will be missed.
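The watchdog's polling loop is built on the `schedule` package; the same pattern in isolation, with a stand-in job instead of the pod monitor:

```python
import time

import schedule

def poll():  # stand-in for ResourceWatchDog.__monitor_pods
    print("polling pods...")

# Same shape as ResourceWatchDog.start(): register tagged jobs, then poll.
schedule.every(10).seconds.do(poll).tag('pod-monitoring')

for _ in range(30):          # bounded here for demonstration; FLTK loops on _alive
    schedule.run_pending()   # runs any job whose interval has elapsed
    time.sleep(1)
```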
+ @return: None + @rtype: None + """ + node: client.V1Node + new_resource_mapper = {} + + self._logger.info("Fetching pod information of cluster...") + for node_name, node in self._node_lookup.items(): + try: + + # Create field selector to only get active pods that 'request' memory + selector = f'status.phase!=Succeeded,status.phase!=Failed,spec.nodeName={node_name}' + # Select pods from all namespaces on specific Kubernetes node + # try: + pod_list: client.V1PodList = self._v1.list_pod_for_all_namespaces(watch=False, field_selector=selector) + # Retrieve allocatable memory of node + alloc_cpu, alloc_mem = (self._Q(node.status.allocatable[item]) for item in ['cpu', 'memory']) + core_req, core_lim, mem_req, mem_lim = 0, 0, 0, 0 + for pod in pod_list.items: + for container in pod.spec.containers: + response = container.resources + reqs = defaultdict(lambda: 0, response.requests or {}) + lmts = defaultdict(lambda: 0, response.limits or {}) + core_req += self._Q(reqs["cpu"]) + mem_req += self._Q(reqs["memory"]) + core_lim += self._Q(lmts["cpu"]) + mem_lim += self._Q(lmts["memory"]) + resource = Resource(node_name, alloc_cpu, alloc_mem, core_req, mem_req, core_lim, mem_lim) + new_resource_mapper[node_name] = resource + except Exception as e: + self._logger.error(f'Namespace lookup for {node_name} failed. Reason: {e}') + + self._resource_lookup = new_resource_mapper + self._logger.debug(self._resource_lookup) + + +class ClusterManager(metaclass=Singleton): + """ + Object to potentially further extend. This shows how the information of different Pods in a cluster can be + requested and parsed. Currently, it mainly exists to start the ResourceWatchDog, which now only logs the amount of + resources... + """ + __alive = False + __threadpool: ThreadPool = None + + def __init__(self): + # When executing in a pod, load the incluster configuration according to + # https://github.com/kubernetes-client/python/blob/master/examples/in_cluster_config.py#L21 + self._v1 = client.CoreV1Api() + self._logger = logging.getLogger('ClusterManager') + self._watchdog = ResourceWatchDog() + + def start(self): + self._logger.info("Spinning up cluster manager...") + # Set debugging to WARNING only, as otherwise DEBUG statements will flood the logs. + client.rest.logger.setLevel(logging.WARNING) + self.__alive = True + self.__thread_pool = ThreadPool(processes=2) + self.__thread_pool.apply_async(self._watchdog.start) + self.__thread_pool.apply_async(self._run) + + def _stop(self): + self._logger.info("Stopping execution of ClusterManager, halting components...") + self._watchdog.stop() + self.__alive = False + self.__thread_pool.join() + self._logger.info("Successfully stopped execution of ClusterManager") + + def _run(self): + while self.__alive: + self._logger.info("Still alive...") + time.sleep(10) + + self._stop() + + +class DeploymentBuilder: + _buildDescription = BuildDescription() + + def reset(self) -> None: + self._buildDescription = BuildDescription() + + @staticmethod + def __resource_dict(mem: str, cpu: int) -> Dict[str, str]: + """ + Private helper function to create a resource dictionary for deployments. Currently only supports the creation + of the requests/limits directory that is needed for a V1ResoruceRequirements object. 
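The helper produces the plain dictionary that the Kubernetes client model expects; a short sketch of how it feeds into `V1ResourceRequirements`:

```python
from kubernetes.client import V1ResourceRequirements

# Same shape as DeploymentBuilder.__resource_dict(mem='2Gi', cpu=2).
req_dict = {'memory': '2Gi', 'cpu': '2'}

# As in build_resources: the requests are currently set equal to the limits.
resources = V1ResourceRequirements(requests=req_dict, limits=req_dict)
```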
+ @param mem: Memory Request/Limit for a Container's ResoruceRequirement + @type mem: + @param cpu: CPU Request/Limit for a Container's ResoruceRequirement + @type cpu: + + @return: + @rtype: + """ + return {'memory': mem, 'cpu': str(cpu)} + + def build_resources(self, arrival_task: ArrivalTask) -> None: + system_reqs = arrival_task.sys_conf + req_dict = self.__resource_dict(mem=system_reqs.executor_memory, + cpu=system_reqs.executor_cores) + # Currently the request is set to the limits. You may want to change this. + self._buildDescription.resources = client.V1ResourceRequirements(requests=req_dict, + limits=req_dict) + + def _generate_command(self, config: BareConfig, task: ArrivalTask): + command = (f'python3 -m fltk client {config.config_path} {task.id} ' + f'--model {task.network} --dataset {task.dataset} ' + f'--optimizer Adam --max_epoch {task.param_conf.max_epoch} ' + f'--batch_size {task.param_conf.bs} --learning_rate {task.param_conf.lr} ' + f'--decay {task.param_conf.lr_decay} --loss CrossEntropy ' + f'--backend gloo') + return command.split(' ') + + def _build_container(self, conf: BareConfig, task: ArrivalTask, name: str = "pytorch", + vol_mnts: List[V1VolumeMount] = None) -> V1Container: + return V1Container( + name=name, + image=conf.cluster_config.image, + command=self._generate_command(conf, task), + image_pull_policy='Always', + # Set the resources to the pre-generated resources + resources=self._buildDescription.resources, + volume_mounts=vol_mnts + ) + + def build_worker_container(self, conf: BareConfig, task: ArrivalTask, name: str = "pytorch") -> None: + self._buildDescription.worker_container = self._build_container(conf, task, name) + + def build_master_container(self, conf: BareConfig, task: ArrivalTask, name: str = "pytorch") -> None: + """ + Function to build the Master worker container. This requires the LOG PV to be mounted on the expected + logging directory. Make sure that any changes in the Helm charts are also reflected here. + @param image: + @type image: + @param name: + @type name: + @return: + @rtype: + """ + master_mounts: List[V1VolumeMount] = [V1VolumeMount( + mount_path=f'/opt/federation-lab/{conf.get_log_dir()}', + name='fl-log-claim', + read_only=False + )] + self._buildDescription.master_container = self._build_container(conf, task, name, master_mounts) + + def build_container(self, task: ArrivalTask, conf: BareConfig): + self.build_master_container(conf, task) + self.build_worker_container(conf, task) + + def build_tolerations(self, tols: List[Tuple[str, Optional[str], str, str]] = None): + if not tols: + self._buildDescription.tolerations = [ + V1Toleration(key="fltk.node", + operator="Exists", + effect="NoSchedule")] + else: + self._buildDescription.tolerations = \ + [V1Toleration(key=key, value=vl, operator=op, effect=effect) for key, vl, op, effect in tols] + + def build_template(self) -> None: + """ + + @return: + @rtype: + """ + # TODO: Add support for tolerations to use only affinitity nodes to deploy to... 
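The default toleration built here matches any `fltk.node` taint; for reference, a node would be tainted with something like `kubectl taint nodes <node-name> fltk.node=true:NoSchedule` (node name assumed), and the corresponding client object is:

```python
from kubernetes.client import V1Toleration

# operator="Exists" tolerates the taint regardless of its value.
default_toleration = V1Toleration(key="fltk.node",
                                  operator="Exists",
                                  effect="NoSchedule")
```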
+        # Ensure with taints that
+        # https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/
+
+        master_volumes = \
+            [V1Volume(name="fl-log-claim",
+                      persistent_volume_claim=V1PersistentVolumeClaimVolumeSource(claim_name='fl-log-claim'))
+             ]
+
+        self._buildDescription.master_template = client.V1PodTemplateSpec(
+            metadata=client.V1ObjectMeta(labels={"app": "fltk-worker"}),
+            spec=client.V1PodSpec(containers=[self._buildDescription.master_container],
+                                  volumes=master_volumes,
+                                  tolerations=self._buildDescription.tolerations))
+        self._buildDescription.worker_template = client.V1PodTemplateSpec(
+            metadata=client.V1ObjectMeta(labels={"app": "fltk-worker"}),
+            spec=client.V1PodSpec(containers=[self._buildDescription.worker_container],
+                                  tolerations=self._buildDescription.tolerations))
+
+    def build_spec(self, task: ArrivalTask, restart_policy: str = 'OnFailure') -> None:
+        master_repl_spec = V1ReplicaSpec(
+            replicas=1,
+            restart_policy=restart_policy,
+            template=self._buildDescription.master_template)
+        master_repl_spec.openapi_types = master_repl_spec.swagger_types
+        pt_rep_spec: Dict[str, V1ReplicaSpec] = {"Master": master_repl_spec}
+        parallelism = int(task.sys_conf.data_parallelism)
+        if parallelism > 1:
+            worker_repl_spec = V1ReplicaSpec(
+                replicas=parallelism - 1,
+                restart_policy=restart_policy,
+                template=self._buildDescription.worker_template
+            )
+            worker_repl_spec.openapi_types = worker_repl_spec.swagger_types
+            pt_rep_spec['Worker'] = worker_repl_spec
+
+        job_spec = V1PyTorchJobSpec(pytorch_replica_specs=pt_rep_spec)
+        job_spec.openapi_types = job_spec.swagger_types
+        self._buildDescription.spec = job_spec
+
+    def construct(self) -> V1PyTorchJob:
+        """
+        Construct a V1PyTorchJob object following the description of the building process. Note that V1PyTorchJob
+        differs slightly from a V1Job object in Kubernetes. Refer to the kubeflow documentation for more information
+        on the V1PyTorchJob object.
+        @return: V1PyTorchJob object that was dynamically constructed.
+        @rtype: V1PyTorchJob
+        """
+        job = V1PyTorchJob(
+            api_version="kubeflow.org/v1",
+            kind="PyTorchJob",
+            metadata=V1ObjectMeta(name=f'trainjob-{self._buildDescription.id}', namespace='test'),
+            spec=self._buildDescription.spec)
+        return job
+
+    def create_identifier(self, task: ArrivalTask):
+        self._buildDescription.id = task.id
+
+
+def construct_job(conf: BareConfig, task: ArrivalTask) -> V1PyTorchJob:
+    """
+    Function to build a Job, based on the specifications of an ArrivalTask, and the general configuration of the
+    BareConfig.
+    @param conf: configuration object that contains specifics to properly start a client.
+    @type conf: BareConfig
+    @param task: Learning task for which a job description must be made.
+    @type task: ArrivalTask
+    @return: KubeFlow compatible PyTorchJob description to create a Job with the requested system and hyperparameters.
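Once constructed, the job can be submitted to the cluster; a sketch assuming the kubeflow `PyTorchJobClient` and that cluster credentials are already loaded (`conf` and `task` are assumed to come from the Orchestrator):

```python
from kubeflow.pytorchjob import PyTorchJobClient

job = construct_job(conf, task)                   # defined at the bottom of this module
PyTorchJobClient().create(job, namespace='test')  # namespace as used in construct()
```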
+ @rtype: V1PyTorchJob + """ + dp_builder = DeploymentBuilder() + dp_builder.create_identifier(task) + dp_builder.build_resources(task) + dp_builder.build_container(task, conf) + dp_builder.build_tolerations() + dp_builder.build_template() + dp_builder.build_spec(task) + job = dp_builder.construct() + job.openapi_types = job.swagger_types + return job diff --git a/fltk/util/cluster/conversion.py b/fltk/util/cluster/conversion.py new file mode 100644 index 00000000..3187720f --- /dev/null +++ b/fltk/util/cluster/conversion.py @@ -0,0 +1,30 @@ +from pathlib import Path +from typing import Union + +from pint import UnitRegistry + + +class Convert: + """ + Conversion class, wrapper around pint UnitRegistry. Assumes that the active path is set to the project root. + Otherwise, provide a custom path to the conversion file when called from a different directory. + """ + + CONVERSION_PATH = Path('configs/quantities/kubernetes.conf') + + def __init__(self, path: Path = None): + if path: + self.__Registry = UnitRegistry(filename=str(path)) + else: + self.__Registry = UnitRegistry(filename=str(self.CONVERSION_PATH)) + + def __call__(self, value: Union[str, int]) -> int: + """ + Function to convert str representation of a CPU/memory quantity into an integer representation. For conversion + metrics see `/configs/quantities/kubernetes.conf` + @param value: String representation of CPU/memory to be converted to quantity. + @type value: str + @return: Integer representation of CPU/memory quantity that was provided by the caller. + @rtype: int + """ + return self.__Registry.Quantity(value) diff --git a/fltk/util/config/__init__.py b/fltk/util/config/__init__.py new file mode 100644 index 00000000..bdcc8f70 --- /dev/null +++ b/fltk/util/config/__init__.py @@ -0,0 +1 @@ +from .base_config import * diff --git a/fltk/util/config/arguments.py b/fltk/util/config/arguments.py new file mode 100644 index 00000000..79ce2490 --- /dev/null +++ b/fltk/util/config/arguments.py @@ -0,0 +1,153 @@ +from argparse import Namespace +from dataclasses import dataclass +from typing import List, Tuple, Type, Dict, T + +import torch.distributed as dist +import torch.nn + +import fltk.nets as nets +from fltk.datasets import CIFAR10Dataset, FashionMNISTDataset, CIFAR100Dataset, MNIST +from fltk.datasets.dataset import Dataset + +CLIENT_ARGS: List[Tuple[str, str, str, type]] = \ + [("model", "md", "Which model to train", str), + ("dataset", "ds", "Which dataset to train the model on", str), + ("batch_size", "bs", + "Number that are 'batched' together in a single forward/backward pass during the optimization steps.", int), + ("max_epoch", "ep", + "Maximum number of times that the 'training' set instances can be used during the optimization steps", int), + ("learning_rate", "lr", "Factor to limit the step size that is taken during each gradient descent step.", float), + ("decay", 'dc', + "Rate at which the learning rate decreases (i.e. 
the optimization takes smaller steps)", float),
+     ("loss", 'ls', "Loss function to use for optimization steps", str),
+     ("optimizer", 'op', "Which optimizer to use during the training process", str)
+     ]
+
+
+@dataclass(frozen=True)
+class LearningParameters:
+    model: str
+    dataset: str
+    batch_size: int
+    max_epoch: int
+    learning_rate: float
+    learning_decay: float
+    loss: str
+    optimizer: str
+
+    _available_nets = {
+        "CIFAR100RESNET": nets.Cifar100ResNet,
+        "CIFAR100VGG": nets.Cifar100VGG,
+        "CIFAR10CNN": nets.Cifar10CNN,
+        "CIFAR10RESNET": nets.Cifar10ResNet,
+        "FASHIONMNISTCNN": nets.FashionMNISTCNN,
+        "FASHIONMNISTRESNET": nets.FashionMNISTResNet
+    }
+
+    _available_data = {
+        "CIFAR10": CIFAR10Dataset,
+        "CIFAR100": CIFAR100Dataset,
+        "FASHIONMNIST": FashionMNISTDataset,
+        "MNIST": MNIST
+    }
+
+    _available_loss = {
+        "CROSSENTROPY": torch.nn.CrossEntropyLoss
+    }
+
+    _available_optimizer = {
+        "ADAM": torch.optim.Adam
+    }
+
+    @staticmethod
+    def __safe_get(lookup: Dict[str, T], keyword: str) -> T:
+        """
+        Static function to 'safe' get elements from a dictionary, to prevent capitalization issues with the
+        lookup keys.
+        @param lookup: Lookup dictionary to 'safe get' from.
+        @type lookup: dict
+        @param keyword: Keyword to 'get' from the Lookup dictionary.
+        @type keyword: str
+        @return: Lookup value from 'safe get' request.
+        @rtype: T
+        """
+        safe_keyword = str.upper(keyword)
+        return lookup.get(safe_keyword)
+
+    def get_model_class(self) -> Type[torch.nn.Module]:
+        """
+        Function to obtain the model class that was given via commandline.
+        @return: Type corresponding to the model that was passed as argument.
+        @rtype: Type[torch.nn.Module]
+        """
+        return self.__safe_get(self._available_nets, self.model)
+
+    def get_dataset_class(self) -> Type[Dataset]:
+        """
+        Function to obtain the dataset class that was given via commandline.
+        @return: Type corresponding to the dataset that was passed as argument.
+        @rtype: Type[Dataset]
+        """
+        return self.__safe_get(self._available_data, self.dataset)
+
+    def get_loss(self) -> Type:
+        """
+        Function to obtain the loss function Type that was given via commandline to be used during the training
+        execution.
+        @return: Type corresponding to the loss function that was passed as argument.
+        @rtype: Type
+        """
+        return self.__safe_get(self._available_loss, self.loss)
+
+    def get_optimizer(self) -> Type[torch.optim.Optimizer]:
+        """
+        Function to obtain the optimizer Type that was given via commandline to be used during the training
+        execution.
+        @return: Type corresponding to the Optimizer to be used during training.
+        @rtype: Type[torch.optim.Optimizer]
+        """
+        return self.__safe_get(self._available_optimizer, self.optimizer)
+
+
+def extract_learning_parameters(args: Namespace) -> LearningParameters:
+    """
+    Function to extract the learning hyper-parameters from the Namespace object for the passed arguments.
+    @param args: Namespace environment for running the Client.
+    @type args: Namespace
+    @return: Parsed learning parameters.
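A usage sketch of the case-insensitive lookups above; the argument values mirror the CLI defaults used elsewhere in this change:

```python
params = LearningParameters(model='Cifar10CNN', dataset='cifar10',
                            batch_size=128, max_epoch=5,
                            learning_rate=0.01, learning_decay=0.002,
                            loss='CrossEntropy', optimizer='Adam')

# Keywords are upper-cased before the dict lookup, so 'Adam' matches 'ADAM'.
model_cls = params.get_model_class()  # -> fltk.nets.Cifar10CNN
loss_cls = params.get_loss()          # -> torch.nn.CrossEntropyLoss
optim_cls = params.get_optimizer()    # -> torch.optim.Adam
```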
+ @rtype: LearningParameters + """ + model = args.model + dataset = args.dataset + batch_size = args.batch_size + epoch = args.max_epoch + lr = args.learning_rate + decay = args.decay + loss = args.loss + optimizer = args.optimizer + return LearningParameters(model, dataset, batch_size, epoch, lr, decay, loss, optimizer) + + +def create_extractor_parser(subparsers): + extractor_parser = subparsers.add_parser('extractor') + extractor_parser.add_argument('config', type=str) + + +def create_client_parser(subparsers) -> None: + client_parser = subparsers.add_parser('client') + client_parser.add_argument('config', type=str) + client_parser.add_argument('task_id', type=str) + + # Add hyper-parameters + for long, short, hlp, tpe in CLIENT_ARGS: + client_parser.add_argument(f'-{short}', f'--{long}', type=tpe, help=hlp, required=True) + + # Add parameter parser for backend + client_parser.add_argument('--backend', type=str, help='Distributed backend', + choices=[dist.Backend.GLOO, dist.Backend.NCCL, dist.Backend.MPI], + default=dist.Backend.GLOO) + + +def create_cluster_parser(subparsers) -> None: + cluster_parser = subparsers.add_parser('cluster') + cluster_parser.add_argument('config', type=str) + cluster_parser.add_argument('-l', '--local', type=bool, default=False) diff --git a/fltk/util/config/base_config.py b/fltk/util/config/base_config.py new file mode 100644 index 00000000..25349406 --- /dev/null +++ b/fltk/util/config/base_config.py @@ -0,0 +1,243 @@ +import os +from dataclasses import dataclass, field +from pathlib import Path + +from dataclasses_json import config, dataclass_json + +from fltk.nets.util.reproducability import init_reproducibility + + +@dataclass_json +@dataclass +class GeneralNetConfig: + save_model: bool = False + save_temp_model: bool = False + save_epoch_interval: int = 1 + save_model_path: str = 'models' + epoch_save_start_suffix: str = 'cloud_experiment' + epoch_save_end_suffix: str = 'cloud_experiment' + scheduler_step_size = 50 + scheduler_gamma = 0.5 + min_lr = 1e-10 + + +@dataclass_json +@dataclass(frozen=True) +class ReproducibilityConfig: + torch_seed: int + arrival_seed: int + + +@dataclass_json +@dataclass(frozen=True) +class TensorboardConfig: + active: bool + record_dir: str + + def prepare_log_dir(self, working_dir: Path = None): + """ + Function to create logging directory used by TensorBoard. When running in a cluster, this function should not be + used, as the TensorBoard instance that is started simultaneously with the Orchestrator. + @param working_dir: Current working directory, by default PWD is assumed at which the Python interpreter is + started. 
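A sketch of how these subparsers resolve a client invocation; the config path and task id below are placeholders:

```python
import argparse

parser = argparse.ArgumentParser(prog='fltk')
subparsers = parser.add_subparsers(dest='mode')
create_client_parser(subparsers)
create_cluster_parser(subparsers)

args = parser.parse_args(
    ['client', 'configs/example.json', 'some-task-id',
     '--model', 'Cifar10CNN', '--dataset', 'cifar10',
     '--batch_size', '128', '--max_epoch', '5',
     '--learning_rate', '0.01', '--decay', '0.002',
     '--loss', 'CrossEntropy', '--optimizer', 'Adam'])
learning_params = extract_learning_parameters(args)
```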
+ @type working_dir: Path + @return: None + @rtype: None + """ + dir_to_check = Path(self.record_dir) + if working_dir: + dir_to_check = working_dir.joinpath(dir_to_check) + if not dir_to_check.exists() and dir_to_check.parent.is_dir(): + dir_to_check.mkdir() + + +@dataclass_json +@dataclass +class ExecutionConfig: + general_net: GeneralNetConfig = field(metadata=config(field_name="net")) + reproducibility: ReproducibilityConfig + tensorboard: TensorboardConfig + + duration: int + experiment_prefix: str = "experiment" + cuda: bool = False + default_model_folder_path = "default_models" + epoch_save_end_suffix = "epoch_end" + save_model_path = "models" + data_path = "data" + log_path = "logging" + + +@dataclass_json +@dataclass +class OrchestratorConfig: + service: str + nic: str + + +@dataclass_json +@dataclass +class ClientConfig: + prefix: str + tensorboard_active: bool + + +@dataclass_json +@dataclass +class ClusterConfig: + orchestrator: OrchestratorConfig + client: ClientConfig + wait_for_clients: bool = True + namespace: str = 'test' + image: str = 'fltk:latest' + + def load_incluster_namespace(self): + with open("/var/run/secrets/kubernetes.io/serviceaccount/namespace") as f: + current_namespace = f.read() + self.namespace = current_namespace + + def load_incluster_image(self): + """ + Function to load the in-cluster image. The fltk-values.yaml file in charts is expected to have (at least) the + following contents. The default Helm chart contains the necessary options to set this correctly. + + provider: + domain: gcr.io + projectName: + imageName: fltk:latest + + @return: None + @rtype: None + """ + self.image = os.environ.get('IMAGE_NAME') + + +@dataclass_json +@dataclass +class BareConfig(object): + execution_config: ExecutionConfig + cluster_config: ClusterConfig = field(metadata=config(field_name="cluster")) + config_path: Path = None + + def set_seed(self) -> None: + """ + Set seeds for better reproducibility, and prevent testing random initialization of the model, + i.e. 'lucky draws' in network initialization. + @return: None + @rtype: None + """ + init_reproducibility( + torch_seed=self.execution_config.reproducibility.torch_seed, + cuda=self.execution_config.cuda, + numpy_seed=self.execution_config.reproducibility.arrival_seed + ) + + def get_duration(self) -> int: + """ + Function to get execution duration of an experiment. + @return: Integer representation of seconds for which the experiments must be run. + @rtype: int + """ + return self.execution_config.duration + + def get_log_dir(self): + """ + Function to get the logging directory from the configuration. + @return: path object to the logging directory. + @rtype: Path + """ + return self.execution_config.log_path + + def get_log_path(self, experiment_id: str, client_id: int, network_name: str) -> Path: + """ + Function to get the logging path that corresponds to a specific experiment, client and network that has been + deployed as learning task. + @param experiment_id: Unique experiment ID (should be provided by the Orchestrator). + @type experiment_id: str + @param client_id: Rank of the client. + @type client_id: int + @param network_name: Name of the network that is to be trained. + @type network_name: str + @return: Path representation of the directory/path should be logged by the training process. 
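Because these classes are `dataclass_json` models, a configuration file deserializes directly into `BareConfig`; a sketch with an assumed file path (the top-level `net` and `cluster` keys follow the `field_name` metadata above):

```python
import json
from pathlib import Path

from fltk.util.config import BareConfig

config_path = Path('configs/example_cloud_experiment.json')  # assumed path
with open(config_path) as f:
    config: BareConfig = BareConfig.from_dict(json.load(f))
config.config_path = config_path
```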
+ @rtype: Path + """ + base_log = Path(self.execution_config.tensorboard.record_dir) + experiment_dir = Path(f"{self.execution_config.experiment_prefix}_{client_id}_{network_name}_{experiment_id}") + return base_log.joinpath(experiment_dir) + + def get_scheduler_step_size(self) -> int: + """ + Function to get the step_size of the Learning Rate decay scheduler/ + @return: Learning rate scheduler step-size. + @rtype: int + """ + return self.execution_config.general_net.scheduler_step_size + + def get_scheduler_gamma(self) -> float: + """ + Function to get multiplication factor for LR update from config. + @return: Multiplication factor for LR update + @rtype: float + """ + return self.execution_config.general_net.scheduler_gamma + + def get_min_lr(self) -> float: + """ + Function to get the minimum learning rate from config. + @return: Minimum learning rate of training process. + @rtype: float + """ + return self.execution_config.general_net.min_lr + + def get_data_path(self) -> Path: + """ + Function to get the data path from config. + @return: Path representation to where data can be written. + @rtype: Path + """ + return Path(self.execution_config.data_path) + + def get_default_model_folder_path(self) -> Path: + """ + @deprecated Function to get the default model folder path from Config, needed for non-default training in the + FLTK framework. + @return: Path representation of model path. + @rtype: Path + """ + return Path(self.execution_config.default_model_folder_path) + + def cuda_enabled(self) -> bool: + """ + Function to check CUDA availability independent of BareConfig structure. + @return: True when CUDA should be used, False otherwise. + @rtype: bool + """ + return self.execution_config.cuda + + def should_save_model(self, epoch_idx) -> bool: + """ + @deprecated Returns true/false models should be saved. + + @param epoch_idx: current training epoch index + @type epoch_idx: int + @return: Boolean indication of whether the model should be saved + @rtype: bool + """ + return self.execution_config.general_net.save_model and ( + epoch_idx == 1 or epoch_idx % self.execution_config.general_net.save_epoch_interval == 0) + + def get_epoch_save_end_suffix(self) -> str: + """ + Function to gather the end suffix for saving after running an epoch. + @return: Suffix for saving epoch data. + @rtype: str + """ + return self.execution_config.epoch_save_end_suffix + + def get_save_model_folder_path(self) -> Path: + """ + Function to get save path for a model. + @return: Path to where the model should be saved. 
+ @rtype: Path + """ + return Path(self.execution_config.save_model_path) diff --git a/fltk/util/data_loader_utils.py b/fltk/util/data_loader_utils.py index d01cf3a4..af58449c 100644 --- a/fltk/util/data_loader_utils.py +++ b/fltk/util/data_loader_utils.py @@ -1,10 +1,12 @@ -import numpy -from torch.utils.data import DataLoader - import os import pickle import random -from ..datasets.distributed import DistDataset as Dataset + +import numpy +from torch.utils.data import DataLoader + +from fltk.datasets.dataset import Dataset + def generate_data_loaders_from_distributed_dataset(distributed_dataset, batch_size): """ @@ -17,10 +19,13 @@ def generate_data_loaders_from_distributed_dataset(distributed_dataset, batch_si """ data_loaders = [] for worker_training_data in distributed_dataset: - data_loaders.append(Dataset.get_data_loader_from_data(batch_size, worker_training_data[0], worker_training_data[1], shuffle=True)) + data_loaders.append( + Dataset.get_data_loader_from_data(batch_size, worker_training_data[0], worker_training_data[1], + shuffle=True)) return data_loaders + def load_train_data_loader(logger, args): """ Loads the training data DataLoader object from a file if available. @@ -36,12 +41,14 @@ def load_train_data_loader(logger, args): raise FileNotFoundError("Couldn't find train data loader stored in file") + def generate_train_loader(args, dataset): train_dataset = dataset.get_train_dataset() X, Y = shuffle_data(args, train_dataset) return dataset.get_data_loader_from_data(args.get_batch_size(), X, Y) + def load_test_data_loader(logger, args): """ Loads the test data DataLoader object from a file if available. @@ -56,6 +63,7 @@ def load_test_data_loader(logger, args): raise FileNotFoundError("Couldn't find train data loader stored in file") + def load_data_loader_from_file(logger, filename) -> DataLoader: """ Loads DataLoader object from a file if available. 
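A round-trip sketch of the pickle helpers in this module; the path follows the pickle paths used elsewhere in this change, and `train_data_loader` is assumed to come from `generate_train_loader`:

```python
import logging

path = 'data_loaders/cifar10/train_data_loader.pickle'
with open(path, 'wb') as f:
    save_data_loader_to_file(train_data_loader, f)  # pickle.dump under the hood

restored = load_data_loader_from_file(logging.getLogger(__name__), path)
```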
@@ -68,12 +76,14 @@ def load_data_loader_from_file(logger, filename) -> DataLoader: with open(filename, "rb") as f: return load_saved_data_loader(f) + def generate_test_loader(args, dataset): test_dataset = dataset.get_test_dataset() X, Y = shuffle_data(args, test_dataset) return dataset.get_data_loader_from_data(args.get_test_batch_size(), X, Y) + def shuffle_data(args, dataset): data = list(zip(dataset[0], dataset[1])) random.shuffle(data) @@ -83,8 +93,10 @@ def shuffle_data(args, dataset): return X, Y + def load_saved_data_loader(file_obj): return pickle.load(file_obj) + def save_data_loader_to_file(data_loader, file_obj): pickle.dump(data_loader, file_obj) diff --git a/fltk/util/default_models.py b/fltk/util/default_models.py deleted file mode 100644 index eda04fb3..00000000 --- a/fltk/util/default_models.py +++ /dev/null @@ -1,47 +0,0 @@ - -import os -import torch -import logging -logging.basicConfig(level=logging.DEBUG) -from fltk.nets import Cifar10CNN, FashionMNISTCNN, Cifar100ResNet, FashionMNISTResNet, Cifar10ResNet, Cifar100VGG -from fltk.util.arguments import Arguments - -if __name__ == '__main__': - args = Arguments(logging) - if not os.path.exists(args.get_default_model_folder_path()): - os.mkdir(args.get_default_model_folder_path()) - - # --------------------------------- - # ----------- Cifar10CNN ---------- - # --------------------------------- - full_save_path = os.path.join(args.get_default_model_folder_path(), "Cifar10CNN.model") - torch.save(Cifar10CNN().state_dict(), full_save_path) - # --------------------------------- - # --------- Cifar10ResNet --------- - # --------------------------------- - full_save_path = os.path.join(args.get_default_model_folder_path(), "Cifar10ResNet.model") - torch.save(Cifar10ResNet().state_dict(), full_save_path) - - # --------------------------------- - # -------- FashionMNISTCNN -------- - # --------------------------------- - full_save_path = os.path.join(args.get_default_model_folder_path(), "FashionMNISTCNN.model") - torch.save(FashionMNISTCNN().state_dict(), full_save_path) - - # --------------------------------- - # ------ FashionMNISTResNet ------- - # --------------------------------- - full_save_path = os.path.join(args.get_default_model_folder_path(), "FashionMNISTResNet.model") - torch.save(FashionMNISTResNet().state_dict(), full_save_path) - - # --------------------------------- - # ----------- Cifar100CNN --------- - # --------------------------------- - full_save_path = os.path.join(args.get_default_model_folder_path(), "Cifar100ResNet.model") - torch.save(Cifar100ResNet().state_dict(), full_save_path) - - # --------------------------------- - # ----------- Cifar100VGG --------- - # --------------------------------- - full_save_path = os.path.join(args.get_default_model_folder_path(), "Cifar100VGG.model") - torch.save(Cifar100VGG().state_dict(), full_save_path) \ No newline at end of file diff --git a/fltk/util/generate_data_distribution.py b/fltk/util/generate_data_distribution.py deleted file mode 100644 index 34135c0b..00000000 --- a/fltk/util/generate_data_distribution.py +++ /dev/null @@ -1,70 +0,0 @@ -import pathlib -import os -import logging - -from fltk.datasets import CIFAR10Dataset, FashionMNISTDataset, CIFAR100Dataset -from fltk.util.arguments import Arguments -from fltk.util.data_loader_utils import generate_train_loader, generate_test_loader, save_data_loader_to_file - -logging.basicConfig(level=logging.DEBUG) - - -if __name__ == '__main__': - args = Arguments(logging) - - # --------------------------------- - 
# ------------ CIFAR10 ------------ - # --------------------------------- - dataset = CIFAR10Dataset(args) - TRAIN_DATA_LOADER_FILE_PATH = "data_loaders/cifar10/train_data_loader.pickle" - TEST_DATA_LOADER_FILE_PATH = "data_loaders/cifar10/test_data_loader.pickle" - - if not os.path.exists("data_loaders/cifar10"): - pathlib.Path("data_loaders/cifar10").mkdir(parents=True, exist_ok=True) - - train_data_loader = generate_train_loader(args, dataset) - test_data_loader = generate_test_loader(args, dataset) - - with open(TRAIN_DATA_LOADER_FILE_PATH, "wb") as f: - save_data_loader_to_file(train_data_loader, f) - - with open(TEST_DATA_LOADER_FILE_PATH, "wb") as f: - save_data_loader_to_file(test_data_loader, f) - - # --------------------------------- - # --------- Fashion-MNIST --------- - # --------------------------------- - dataset = FashionMNISTDataset(args) - TRAIN_DATA_LOADER_FILE_PATH = "data_loaders/fashion-mnist/train_data_loader.pickle" - TEST_DATA_LOADER_FILE_PATH = "data_loaders/fashion-mnist/test_data_loader.pickle" - - if not os.path.exists("data_loaders/fashion-mnist"): - pathlib.Path("data_loaders/fashion-mnist").mkdir(parents=True, exist_ok=True) - - train_data_loader = generate_train_loader(args, dataset) - test_data_loader = generate_test_loader(args, dataset) - - with open(TRAIN_DATA_LOADER_FILE_PATH, "wb") as f: - save_data_loader_to_file(train_data_loader, f) - - with open(TEST_DATA_LOADER_FILE_PATH, "wb") as f: - save_data_loader_to_file(test_data_loader, f) - - # --------------------------------- - # ------------ CIFAR100 ----------- - # --------------------------------- - dataset = CIFAR100Dataset(args) - TRAIN_DATA_LOADER_FILE_PATH = "data_loaders/cifar100/train_data_loader.pickle" - TEST_DATA_LOADER_FILE_PATH = "data_loaders/cifar100/test_data_loader.pickle" - - if not os.path.exists("data_loaders/cifar100"): - pathlib.Path("data_loaders/cifar100").mkdir(parents=True, exist_ok=True) - - train_data_loader = generate_train_loader(args, dataset) - test_data_loader = generate_test_loader(args, dataset) - - with open(TRAIN_DATA_LOADER_FILE_PATH, "wb") as f: - save_data_loader_to_file(train_data_loader, f) - - with open(TEST_DATA_LOADER_FILE_PATH, "wb") as f: - save_data_loader_to_file(test_data_loader, f) diff --git a/fltk/util/generate_docker_compose.py b/fltk/util/generate_docker_compose.py deleted file mode 100644 index 52a7bb36..00000000 --- a/fltk/util/generate_docker_compose.py +++ /dev/null @@ -1,59 +0,0 @@ -import sys -import yaml -import copy - -template_path = './deploy/templates' - -def load_system_template(): - with open(f'{template_path}/system_stub.yml') as file: - documents = yaml.full_load(file) - return documents - -def load_client_template(type='default'): - with open(f'{template_path}/client_stub_{type}.yml') as file: - documents = yaml.full_load(file) - return documents - -def generate_client(id, template: dict, world_size: int, type='default'): - local_template = copy.deepcopy(template) - key_name = list(local_template.keys())[0] - container_name = f'client_{type}_{id}' - local_template[container_name] = local_template.pop(key_name) - for key, item in enumerate(local_template[container_name]['environment']): - if item == 'RANK={rank}': - local_template[container_name]['environment'][key] = item.format(rank=id) - if item == 'WORLD_SIZE={world_size}': - local_template[container_name]['environment'][key] = item.format(world_size=world_size) - - local_template[container_name]['ports'] = [f'{5000+id}:5000'] - return local_template, container_name - - -def 
generate(num_clients: int): - world_size = num_clients + 1 - system_template :dict = load_system_template() - - for key, item in enumerate(system_template['services']['fl_server']['environment']): - if item == 'WORLD_SIZE={world_size}': - system_template['services']['fl_server']['environment'][key] = item.format(world_size=world_size) - - for client_id in range(1, num_clients+1): - client_type = 'default' - if client_id == 1: - client_type='slow' - if client_id == 2: - client_type='medium' - client_template: dict = load_client_template(type=client_type) - client_definition, container_name = generate_client(client_id, client_template, world_size, type=client_type) - system_template['services'].update(client_definition) - - with open(r'./docker-compose.yml', 'w') as file: - yaml.dump(system_template, file, sort_keys=False) - - -if __name__ == '__main__': - - num_clients = int(sys.argv[1]) - generate(num_clients) - print('Done') - diff --git a/fltk/util/iid_equal.py b/fltk/util/iid_equal.py deleted file mode 100644 index c47bcc16..00000000 --- a/fltk/util/iid_equal.py +++ /dev/null @@ -1,19 +0,0 @@ -import torch - -def distribute_batches_equally(train_data_loader, num_workers): - """ - Gives each worker the same number of batches of training data. - - :param train_data_loader: Training data loader - :type train_data_loader: torch.utils.data.DataLoader - :param num_workers: number of workers - :type num_workers: int - """ - distributed_dataset = [[] for i in range(num_workers)] - - for batch_idx, (data, target) in enumerate(train_data_loader): - worker_idx = batch_idx % num_workers - - distributed_dataset[worker_idx].append((data, target)) - - return distributed_dataset diff --git a/fltk/util/label_replacement.py b/fltk/util/label_replacement.py deleted file mode 100644 index 1b09d18c..00000000 --- a/fltk/util/label_replacement.py +++ /dev/null @@ -1,12 +0,0 @@ -def apply_class_label_replacement(X, Y, replacement_method): - """ - Replace class labels using the replacement method - - :param X: data features - :type X: numpy.Array() - :param Y: data labels - :type Y: numpy.Array() - :param replacement_method: Method to update targets - :type replacement_method: method - """ - return (X, replacement_method(Y, set(Y))) diff --git a/fltk/util/log.py b/fltk/util/log.py deleted file mode 100644 index a80661a3..00000000 --- a/fltk/util/log.py +++ /dev/null @@ -1,9 +0,0 @@ -import logging - -from torch.distributed import rpc - -class FLLogger: - @staticmethod - @rpc.functions.async_execution - def log(arg1, node_id, log_line, report_time): - logging.info(f'[{node_id}: {report_time}]: {log_line}') \ No newline at end of file diff --git a/fltk/util/results.py b/fltk/util/results.py index af560479..9c3333f2 100644 --- a/fltk/util/results.py +++ b/fltk/util/results.py @@ -1,5 +1,7 @@ from dataclasses import dataclass -from typing import Any + +import numpy as np + @dataclass class EpochData: @@ -9,12 +11,7 @@ class EpochData: loss_train: float accuracy: float loss: float - class_precision: Any - class_recall: Any + class_precision: np.array + class_recall: np.array + confusion_mat: np.array client_id: str = None - - def to_csv_line(self): - delimeter = ',' - values = self.__dict__.values() - values = [str(x) for x in values] - return delimeter.join(values) diff --git a/fltk/util/singleton.py b/fltk/util/singleton.py new file mode 100644 index 00000000..d7f4d26a --- /dev/null +++ b/fltk/util/singleton.py @@ -0,0 +1,12 @@ +import threading + +class Singleton(type): + _lock = threading.Lock() + _instances = 
{}
+
+    def __call__(cls, *args, **kwargs):
+        if cls not in cls._instances:
+            with cls._lock:
+                if cls not in cls._instances:
+                    cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs)
+        return cls._instances[cls]
\ No newline at end of file
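The Singleton metaclass above implements thread-safe lazy initialization with double-checked locking: any class declaring `metaclass=Singleton` is constructed at most once per process, and every later call returns the cached instance. A minimal usage sketch (the class name `ConfigStore` is purely illustrative, not part of FLTK):

    from fltk.util.singleton import Singleton

    class ConfigStore(metaclass=Singleton):
        def __init__(self):
            self.values = {}

    first = ConfigStore()
    second = ConfigStore()
    assert first is second  # both calls yield the one shared instance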
diff --git a/fltk/util/task/__init__.py b/fltk/util/task/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/fltk/util/task/config/__init__.py b/fltk/util/task/config/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/fltk/util/task/config/parameter.py b/fltk/util/task/config/parameter.py
new file mode 100644
index 00000000..9cfb082d
--- /dev/null
+++ b/fltk/util/task/config/parameter.py
@@ -0,0 +1,134 @@
+import json
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import List
+
+from dataclasses_json import config, dataclass_json
+
+
+@dataclass_json
+@dataclass(frozen=True)
+class HyperParameters:
+    """
+    Learning hyperparameters.
+
+    bs: Batch size, i.e. the number of samples processed in each forward/backward pass.
+    max_epoch: Maximum number of epochs to train for.
+    lr: Learning rate, limiting the step size of each gradient update.
+    lr_decay: Rate at which the learning rate decays over time.
+    """
+    bs: int = field(metadata=config(field_name="batchSize"))
+    max_epoch: int = field(metadata=config(field_name="maxEpoch"))
+    lr: str = field(metadata=config(field_name="learningRate"))
+    lr_decay: str = field(metadata=config(field_name="learningrateDecay"))
+
+
+@dataclass_json
+@dataclass(frozen=True)
+class Priority:
+    """
+    Job class priority, indicating the precedence of one arrival over another.
+    """
+    priority: int
+    probability: float
+
+
+@dataclass_json
+@dataclass(frozen=True)
+class SystemParameters:
+    """
+    System parameters to spawn pods with.
+
+    data_parallelism: Number of pods (distributed) that will work together on training the network.
+    executor_cores: Number of cores assigned to each executor.
+    executor_memory: Amount of RAM allocated to each executor.
+    action: Indicates whether the task concerns 'inference' or 'train'ing.
+    """
+    data_parallelism: int = field(metadata=config(field_name="dataParallelism"))
+    executor_cores: int = field(metadata=config(field_name="executorCores"))
+    executor_memory: str = field(metadata=config(field_name="executorMemory"))
+    action: str = field(metadata=config(field_name="action"))
+
+
+@dataclass_json
+@dataclass(frozen=True)
+class NetworkConfiguration:
+    """
+    Dataclass describing the network and dataset that are used for a task.
+    """
+    network: str
+    dataset: str
+
+
+@dataclass_json
+@dataclass(frozen=True)
+class JobClassParameter:
+    """
+    Dataclass describing the job-specific parameters (system and hyper).
+    """
+    network_configuration: NetworkConfiguration = field(metadata=config(field_name="networkConfiguration"))
+    system_parameters: SystemParameters = field(metadata=config(field_name="systemParameters"))
+    hyper_parameters: HyperParameters = field(metadata=config(field_name="hyperParameters"))
+    class_probability: float = field(metadata=config(field_name="classProbability"))
+    priorities: List[Priority] = field(metadata=config(field_name="priorities"))
+
+
+@dataclass_json
+@dataclass(frozen=True)
+class JobDescription:
+    """
+    Dataclass describing the characteristics of a job type, as well as the corresponding arrival statistic.
+    Currently, the arrival statistic is the lambda (rate) parameter of a Poisson arrival process.
+
+    preemptible_jobs: indicates whether jobs can be preemptively rescheduled by the scheduler. This is currently
+    not implemented in FLTK, but could be added as an (advanced) project.
+    """
+    job_class_parameters: List[JobClassParameter] = field(metadata=config(field_name="jobClassParameters"))
+    arrival_statistic: float = field(metadata=config(field_name="lambda"))
+    preemptible_jobs: float = field(metadata=config(field_name="preemptJobs"))
+
+
+@dataclass(order=True)
+class TrainTask:
+    """
+    Training description used by the orchestrator to generate tasks. Contains 'transposed' information from the
+    configuration file, to make job generation easier and cleaner by using a 'flat' dataclass.
+
+    The dataclass is ordered, to allow arrived tasks to be ordered in a PriorityQueue (for scheduling).
+    """
+    priority: int
+    network_configuration: NetworkConfiguration = field(compare=False)
+    system_parameters: SystemParameters = field(compare=False)
+    hyper_parameters: HyperParameters = field(compare=False)
+    identifier: str = field(compare=False)
+
+    def __init__(self, identity: str, job_parameters: JobClassParameter, priority: Priority):
+        """
+        Overridden init method for the dataclass, to allow 'exploding' a JobDescription object into a flattened
+        TrainTask object.
+        @param identity: Unique identifier of the training task.
+        @type identity: str
+        @param job_parameters: Sampled job class parameters to flatten into the task.
+        @type job_parameters: JobClassParameter
+        @param priority: Sampled priority with which the task should be scheduled.
+        @type priority: Priority
+        """
+        self.identifier = identity
+        self.network_configuration = job_parameters.network_configuration
+        self.system_parameters = job_parameters.system_parameters
+        self.hyper_parameters = job_parameters.hyper_parameters
+        self.priority = priority.priority
+
+
+class ExperimentParser(object):
+
+    def __init__(self, config_path: Path):
+        self.__config_path = config_path
+
+    def parse(self) -> List[JobDescription]:
+        """
+        Parse function to load a JSON config into JobDescription objects. Any change to the JSON file format
+        should be reflected in the classes used. For more information refer to the dataclasses-json
+        documentation: https://pypi.org/project/dataclasses-json/.
+        """
+        with open(self.__config_path, 'r') as config_file:
+            config_dict = json.load(config_file)
+            job_list = [JobDescription.from_dict(job_description) for job_description in config_dict]
+        return job_list
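For reference, the JSON that ExperimentParser consumes is a list of job descriptions whose keys mirror the field_name metadata above (the default file lives at configs/tasks/example_arrival_config.json). A minimal sketch with purely illustrative values; the temporary path and all numbers are assumptions, not FLTK defaults:

    import json
    from pathlib import Path

    from fltk.util.task.config.parameter import ExperimentParser

    example_config = [{
        "jobClassParameters": [{
            "networkConfiguration": {"network": "FashionMNISTCNN", "dataset": "mnist"},
            "systemParameters": {"dataParallelism": 3, "executorCores": 2,
                                 "executorMemory": "2Gi", "action": "train"},
            "hyperParameters": {"batchSize": 128, "maxEpoch": 5,
                                "learningRate": "0.01", "learningrateDecay": "0.0002"},
            "classProbability": 1.0,
            "priorities": [{"priority": 1, "probability": 1.0}],
        }],
        "lambda": 1.5,      # rate parameter of the Poisson arrival process
        "preemptJobs": 0.0,
    }]

    config_path = Path("/tmp/example_arrival_config.json")
    config_path.write_text(json.dumps(example_config))
    jobs = ExperimentParser(config_path).parse()
    assert jobs[0].arrival_statistic == 1.5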
diff --git a/fltk/util/task/generator/__init__.py b/fltk/util/task/generator/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/fltk/util/task/generator/arrival_generator.py b/fltk/util/task/generator/arrival_generator.py
new file mode 100644
index 00000000..9f453e9b
--- /dev/null
+++ b/fltk/util/task/generator/arrival_generator.py
@@ -0,0 +1,174 @@
+import logging
+import multiprocessing
+import time
+from abc import abstractmethod
+from dataclasses import dataclass
+from pathlib import Path
+from queue import Queue
+from random import choices
+from typing import Dict, List, Union
+
+import numpy as np
+
+from fltk.util.singleton import Singleton
+from fltk.util.task.config.parameter import TrainTask, JobDescription, ExperimentParser, JobClassParameter
+
+
+@dataclass
+class ArrivalGenerator(metaclass=Singleton):
+    """
+    Abstract base class for generating arrivals in the system. Generators are intended to run alongside the
+    orchestrator (e.g. in a separate thread), as `start` blocks for the requested duration.
+    """
+
+    configuration_path: Path
+    logger: logging.Logger = None
+    arrivals: "Queue[Arrival]" = Queue()
+
+    @abstractmethod
+    def load_config(self):
+        raise NotImplementedError("Cannot call abstract function")
+
+    @abstractmethod
+    def generate_arrival(self, task_id):
+        """
+        Function to generate an arrival based on a task ID.
+        @param task_id: Identifier of the job to generate an arrival for.
+        @type task_id: str
+        @return: The generated arrival.
+        @rtype: Arrival
+        """
+        raise NotImplementedError("Cannot call abstract function")
+
+
+@dataclass
+class Arrival:
+    ticks: int
+    task: TrainTask
+    task_id: str
+
+    def get_priority(self):
+        return self.task.priority
+
+    def get_network(self) -> str:
+        return self.task.network_configuration.network
+
+    def get_dataset(self) -> str:
+        return self.task.network_configuration.dataset
+
+    def get_system_config(self):
+        return self.task.system_parameters
+
+    def get_parameter_config(self):
+        return self.task.hyper_parameters
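Arrivals produced by a generator end up on the shared `arrivals` queue, which an orchestrator can drain from another thread. A hedged consumption sketch (the 60-second duration is arbitrary, and the default config file must exist for the ExperimentGenerator defined below to load):

    import threading

    from fltk.util.task.generator.arrival_generator import ExperimentGenerator

    generator = ExperimentGenerator()  # loads configs/tasks/example_arrival_config.json by default
    threading.Thread(target=generator.start, args=(60,), daemon=True).start()

    arrival = generator.arrivals.get()  # blocks until the first arrival is produced
    print(arrival.get_network(), arrival.get_dataset(), arrival.get_priority())
    generator.stop()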
+
+
+class ExperimentGenerator(ArrivalGenerator):
+    start_time: float = -1
+    stop_time: float = -1
+    job_dict: Dict[str, JobDescription] = None
+
+    _tick_list: List[Arrival] = []
+    _alive: bool = False
+    _decrement = 10
+    __default_config: Path = Path('configs/tasks/example_arrival_config.json')
+
+    def __init__(self, custom_config: Path = None):
+        super(ExperimentGenerator, self).__init__(custom_config or self.__default_config)
+        self.load_config()
+
+    def set_logger(self, name: str = None):
+        """
+        Set the logging name of the ArrivalGenerator object to a recognizable name. Needs to be called once, as
+        otherwise the logger remains uninitialized, resulting in failed execution.
+        @param name: Name to use; by default the class name is used.
+        @type name: str
+        @return: None
+        @rtype: None
+        """
+        logging_name = name or self.__class__.__name__
+        self.logger = logging.getLogger(logging_name)
+
+    def load_config(self, alternative_path: Path = None):
+        """
+        Load the configuration from the default path, if an alternative path is not provided.
+        @param alternative_path: Optional non-default location to load the configuration from.
+        @type alternative_path: Path
+        @return: None
+        @rtype: None
+        """
+        parser = ExperimentParser(config_path=alternative_path or self.configuration_path)
+        experiment_descriptions = parser.parse()
+        self.job_dict = {f'train_job_{indx}': item for indx, item in enumerate(experiment_descriptions)}
+
+    def generate_arrival(self, task_id: str) -> Arrival:
+        """
+        Generate a training task for a JobDescription once its inter-arrival time has elapsed.
+        @param task_id: Identifier for a training task corresponding to the JobDescription.
+        @type task_id: str
+        @return: Generated arrival corresponding to the unique task_id.
+        @rtype: Arrival
+        """
+        self.logger.info(f"Creating task for {task_id}")
+        job: JobDescription = self.job_dict[task_id]
+        parameters: JobClassParameter = choices(job.job_class_parameters, [param.class_probability for param in job.job_class_parameters])[0]
+        priority = choices(parameters.priorities, [prio.probability for prio in parameters.priorities], k=1)[0]
+
+        inter_arrival_ticks = np.random.poisson(lam=job.arrival_statistic)
+        train_task = TrainTask(task_id, parameters, priority)
+
+        return Arrival(inter_arrival_ticks, train_task, task_id)
+
+    def start(self, duration: Union[float, int]):
+        """
+        Function to start the arrival generator; blocks until `duration` seconds have passed or `stop` is called.
+        @return: None
+        @rtype: None
+        """
+        if not self.logger:
+            self.set_logger()
+        self.logger.info("Starting execution of arrival generator...")
+        self._alive = True
+        self.run(duration)
+
+    def stop(self) -> None:
+        """
+        Function to call when the generator needs to stop; the run loop terminates at its next iteration.
+        @return: None
+        @rtype: None
+        """
+        self.logger.info("Received stopping signal")
+        self._alive = False
+
+    def run(self, duration: float):
+        """
+        Run function to generate arrivals during the existence of the Orchestrator. Applies time-drift correction
+        over long-running executions (i.e. it compensates for time spent in the Python interpreter).
+        @return: None
+        @rtype: None
+        """
+        self.start_time = time.time()
+        self.logger.info("Populating tick lists with initial arrivals")
+        for task_id in self.job_dict.keys():
+            new_arrival: Arrival = self.generate_arrival(task_id)
+            self._tick_list.append(new_arrival)
+            self.logger.info(f"Arrival {new_arrival} arrives at {new_arrival.ticks} seconds")
+        event = multiprocessing.Event()
+        while self._alive and time.time() - self.start_time < duration:
+            save_time = time.time()
+
+            new_scheduled = []
+            for entry in self._tick_list:
+                entry.ticks -= self._decrement
+                if entry.ticks <= 0:
+                    self.arrivals.put(entry)
+                    new_arrival = self.generate_arrival(entry.task_id)
+                    new_scheduled.append(new_arrival)
+                    self.logger.info(f"Arrival {new_arrival} arrives at {new_arrival.ticks} seconds")
+                else:
+                    new_scheduled.append(entry)
+            self._tick_list = new_scheduled
+            # Correct for time drift between iterations; otherwise the drift adds up and arrivals generate too late
+            correction_time = time.time() - save_time
+            event.wait(timeout=self._decrement - correction_time)
+        self.stop_time = time.time()
+        self.logger.info(f"Stopped execution at: {self.stop_time}, duration: {self.stop_time - self.start_time}/{duration}")
diff --git a/fltk/util/task/task.py b/fltk/util/task/task.py
new file mode 100644
index 00000000..48020427
--- /dev/null
+++ b/fltk/util/task/task.py
@@ -0,0 +1,22 @@
+from dataclasses import field, dataclass
+from uuid import UUID
+
+from fltk.util.task.config.parameter import SystemParameters, HyperParameters
+
+
+@dataclass(order=True)
+class ArrivalTask:
+    """
+    Object containing the configuration of a training task. It describes the following properties:
+    * Number of machines
+    * System configuration
+    * Network
+    * Dataset
+    * Hyper-parameters
+    """
+    priority: int
+    id: UUID = field(compare=False)
+    network: str = field(compare=False)
+    dataset: str = field(compare=False)
+    sys_conf: SystemParameters = field(compare=False)
+    param_conf: HyperParameters = field(compare=False)
diff --git a/fltk/util/tensor_converter.py b/fltk/util/tensor_converter.py
deleted file mode 100644
index f5f7abee..00000000
--- a/fltk/util/tensor_converter.py
+++ /dev/null
@@ -1,20 +0,0 @@
-import numpy
-
-def convert_distributed_data_into_numpy(distributed_dataset):
-    """
-    Converts a distributed dataset (returned by a data distribution method) from Tensors into numpy arrays.
- - :param distributed_dataset: Distributed dataset - :type distributed_dataset: list(tuple) - """ - converted_distributed_dataset = [] - - for worker_idx in range(len(distributed_dataset)): - worker_training_data = distributed_dataset[worker_idx] - - X_ = numpy.array([tensor.numpy() for batch in worker_training_data for tensor in batch[0]]) - Y_ = numpy.array([tensor.numpy() for batch in worker_training_data for tensor in batch[1]]) - - converted_distributed_dataset.append((X_, Y_)) - - return converted_distributed_dataset diff --git a/jupyter/load_tensorboard.ipynb b/jupyter/load_tensorboard.ipynb new file mode 100644 index 00000000..232b95d5 --- /dev/null +++ b/jupyter/load_tensorboard.ipynb @@ -0,0 +1,251 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "#!pip install tensorboard tensorflow pandas\n", + "\n", + "\"\"\"\n", + "If tensorboard is not installed (or other dependencies, such as tensorflow and pandas),\n", + "uncomment the command in top and re-run. This needs only to be run once in a Jupyter kernel.\n", + "\"\"\"\n", + "\n", + "%load_ext tensorboard\n", + "\n", + "from tensorflow.python.summary.summary_iterator import summary_iterator\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "\"\"\"\n", + "Change the LOG_DIR argument to point to the correct directory, you may want to use an\n", + "absolute path if you run into issues.\n", + "\"\"\"\n", + "%tensorboard --logdir ../logging" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "def logs_to_pandas(path: str) -> pd.DataFrame:\n", + " \"\"\"convert single tensorflow log file to pandas DataFrame\n", + " Parameters\n", + " ----------\n", + " path : str\n", + " path to tensorflow log file\n", + " Returns\n", + " -------\n", + " pd.DataFrame\n", + " converted dataframe\n", + " \"\"\"\n", + "\n", + " runlog_data = pd.DataFrame({\"metric\": [], \"value\": [], \"step\": [], \"wall_time\": []})\n", + " try:\n", + " event_acc = summary_iterator(path)\n", + " for event in list(event_acc)[1:]:\n", + " step, wall_time = event.step, pd.to_datetime(event.wall_time, unit='s')\n", + " simple_extractor = [{\"metric\": v.tag, \"value\": v.simple_value, \"step\": step, 'wall_time': wall_time} for v in event.summary.value]\n", + " event_r = pd.DataFrame(simple_extractor)\n", + " runlog_data = pd.concat([runlog_data, event_r])\n", + " # Dirty catch of DataLossError\n", + " except Exception as e:\n", + " print(\"Event file possibly corrupt: {}\".format(path))\n", + " print(e)\n", + " return runlog_data" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING:tensorflow:From /home/jeroen/fltk-testbed/fltk-testbed/venv/lib/python3.9/site-packages/tensorflow/python/summary/summary_iterator.py:31: tf_record_iterator (from tensorflow.python.lib.io.tf_record) is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "Use eager execution and: \n", + "`tf.data.TFRecordDataset(path)`\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
metricvaluestepwall_time
0training loss per epoch0.3589331.02021-09-28 07:41:37.473129472
0accuracy per epoch87.6999971.02021-09-28 07:41:37.473197568
0training loss per epoch0.2659722.02021-09-28 07:42:04.857327872
0accuracy per epoch88.3700032.02021-09-28 07:42:04.857392384
0training loss per epoch0.2668543.02021-09-28 07:42:32.819394048
0accuracy per epoch89.7799993.02021-09-28 07:42:32.819455744
0training loss per epoch0.2505564.02021-09-28 07:43:19.743513088
0accuracy per epoch90.1299974.02021-09-28 07:43:19.743613696
\n", + "
" + ], + "text/plain": [ + " metric value step wall_time\n", + "0 training loss per epoch 0.358933 1.0 2021-09-28 07:41:37.473129472\n", + "0 accuracy per epoch 87.699997 1.0 2021-09-28 07:41:37.473197568\n", + "0 training loss per epoch 0.265972 2.0 2021-09-28 07:42:04.857327872\n", + "0 accuracy per epoch 88.370003 2.0 2021-09-28 07:42:04.857392384\n", + "0 training loss per epoch 0.266854 3.0 2021-09-28 07:42:32.819394048\n", + "0 accuracy per epoch 89.779999 3.0 2021-09-28 07:42:32.819455744\n", + "0 training loss per epoch 0.250556 4.0 2021-09-28 07:43:19.743513088\n", + "0 accuracy per epoch 90.129997 4.0 2021-09-28 07:43:19.743613696" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "path = \"../logging/cloud_experiment_0_FashionMNISTCNN_ee232974-dcde-4977-8f3d-40bf1accabb2/events.out.tfevents.1632814874.not-ubuntu.74346.0\"\n", + "\n", + "logs_to_pandas(path)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "PyCharm (fltk-testbed)", + "language": "python", + "name": "pycharm-d7d3f210" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 00000000..6ffcf141 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,75 @@ +absl-py==0.12.0 +aiohttp==3.7.4.post0 +async-timeout==3.0.1 +attrs==21.2.0 +cachetools==4.2.2 +certifi==2020.12.5 +chardet==4.0.0 +colorful==0.5.4 +dataclass-csv==1.3.0 +dataclasses-json==0.5.4 +fsspec==2021.7.0 +future==0.18.2 +google-auth==1.30.0 +google-auth-oauthlib==0.4.4 +grpcio==1.37.1 +idna==2.10 +iniconfig==1.1.1 +iteration-utilities==0.11.0 +joblib==1.0.1 +kubeflow-pytorchjob==0.1.3 +kubernetes==17.17.0 +Markdown==3.3.4 +marshmallow==3.13.0 +marshmallow-enum==1.5.1 +memory-profiler==0.58.0 +multidict==5.1.0 +mypy-extensions==0.4.3 +numpy==1.20.2 +oauthlib==3.1.0 +packaging==21.0 +pandas==1.3.2 +Pillow==8.3.2 +Pint==0.17 +pluggy==1.0.0 +prettyprinter==0.18.0 +protobuf==3.16.0 +psutil==5.8.0 +py==1.10.0 +pyasn1==0.4.8 +pyasn1-modules==0.2.8 +pyDeprecate==0.3.1 +Pygments==2.10.0 +pyparsing==2.4.7 +pytest==6.2.5 +python-dateutil==2.8.2 +python-dotenv==0.17.1 +pytorch-lightning==1.4.4 +pytz==2021.1 +PyYAML==5.4.1 +requests==2.25.1 +requests-oauthlib==1.3.0 +retrying==1.3.3 +rsa==4.7.2 +schedule==1.1.0 +scikit-learn==0.23.2 +scipy==1.6.3 +six==1.16.0 +stringcase==1.2.0 +table-logger==0.3.6 +tensorboard==2.5.0 +tensorboard-data-server==0.6.1 +tensorboard-plugin-wit==1.8.0 +threadpoolctl==2.1.0 +toml==0.10.2 +torch==1.9.0 +torchmetrics==0.5.0 +torchsummary==1.5.1 +torchvision==0.10.0 +tqdm==4.49.0 +typing-extensions==3.10.0.0 +typing-inspect==0.7.1 +urllib3==1.26.5 +websocket-client==1.2.0 +Werkzeug==1.0.1 +yarl==1.6.3 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..59a1a6d3 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,70 @@ +absl-py==0.12.0 +aiohttp==3.7.4.post0 +async-timeout==3.0.1 +attrs==21.2.0 +cachetools==4.2.2 +certifi==2020.12.5 +chardet==4.0.0 +colorful==0.5.4 +dataclass-csv==1.3.0 +dataclasses-json==0.5.4 +fsspec==2021.7.0 +future==0.18.2 +google-auth==1.30.0 +google-auth-oauthlib==0.4.4 +grpcio==1.37.1 +idna==2.10 +iteration-utilities==0.11.0 +joblib==1.0.1 +kubeflow-pytorchjob==0.1.3 +kubernetes==17.17.0 
+Markdown==3.3.4 +marshmallow==3.13.0 +marshmallow-enum==1.5.1 +memory-profiler==0.58.0 +multidict==5.1.0 +mypy-extensions==0.4.3 +numpy==1.20.2 +oauthlib==3.1.0 +packaging==21.0 +pandas==1.3.2 +Pillow==8.3.2 +Pint==0.17 +prettyprinter==0.18.0 +protobuf==3.16.0 +psutil==5.8.0 +pyasn1==0.4.8 +pyasn1-modules==0.2.8 +pyDeprecate==0.3.1 +Pygments==2.10.0 +pyparsing==2.4.7 +python-dateutil==2.8.2 +python-dotenv==0.17.1 +pytorch-lightning==1.4.4 +pytz==2021.1 +PyYAML==5.4.1 +requests==2.25.1 +requests-oauthlib==1.3.0 +retrying==1.3.3 +rsa==4.7.2 +schedule==1.1.0 +scikit-learn==0.23.2 +scipy==1.6.3 +six==1.16.0 +stringcase==1.2.0 +table-logger==0.3.6 +tensorboard==2.5.0 +tensorboard-data-server==0.6.1 +tensorboard-plugin-wit==1.8.0 +threadpoolctl==2.1.0 +torch==1.9.0 +torchmetrics==0.5.0 +torchsummary==1.5.1 +torchvision==0.10.0 +tqdm==4.49.0 +typing-extensions==3.10.0.0 +typing-inspect==0.7.1 +urllib3==1.26.5 +websocket-client==1.2.0 +Werkzeug==1.0.1 +yarl==1.6.3 diff --git a/setup.py b/setup.py deleted file mode 100644 index 55bbcd75..00000000 --- a/setup.py +++ /dev/null @@ -1,35 +0,0 @@ -from setuptools import setup, find_packages -from fltk import __version__ -setup( - name="fltk", - author="Bart Cox", - author_email="b.a.cox@tudelft.nl", - maintainer="Bart Cox", - maintainer_email="b.a.cox@tudelft.nl", - description="Federated Learning Toolkit", - packages=find_packages(), - version=__version__, - entry_points={ - "console_scripts": [ - "fltk = fltk.__main__:main", - ] - }, - include_package_data=True, - data_files=[('share/tudelft/fltk/configs', ['configs/experiment.yaml'])], - install_requires= - [ - 'tqdm==4.49.0', - 'scikit-learn==0.23.2', - 'pandas==1.1.2', - 'numpy>=1.20.0', - 'torch==1.7.1', - 'torchvision==0.8.2', - 'scipy==1.4.1', - 'h5py==2.10.0', - 'requests', - 'pyyaml', - 'torchsummary', - 'dataclass-csv', - 'tensorboard' - ] -) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/nets/__init__.py b/tests/nets/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/nets/reproducibility_test.py b/tests/nets/reproducibility_test.py new file mode 100644 index 00000000..d0cb1c4b --- /dev/null +++ b/tests/nets/reproducibility_test.py @@ -0,0 +1,36 @@ +from typing import Type, OrderedDict + +import pytest +import torch + +from fltk.nets import Cifar10CNN, Cifar10ResNet, ResNet18, ResNet34, ResNet50, ResNet101, ResNet152, Cifar100ResNet, \ + Cifar100VGG, FashionMNISTCNN, FashionMNISTResNet, SimpleMnist, SimpleNet +from fltk.nets.util import init_reproducibility + +models = [ + (Cifar10CNN), + (Cifar10ResNet), + (ResNet18), + (ResNet34), + (ResNet50), + (ResNet101), + (ResNet152), + (Cifar100ResNet), + (Cifar100VGG), + (FashionMNISTCNN), + (FashionMNISTResNet), + (SimpleMnist), + (SimpleNet) +] + +@pytest.mark.parametrize('network_class', models) +def test_reproducible_initialization(network_class: Type[torch.nn.Module]): + init_reproducibility() + param_1: OrderedDict[str, torch.nn.Module] = network_class().state_dict() + init_reproducibility() + param_2: OrderedDict[str, torch.nn.Module] = network_class().state_dict() + + for key, value in param_1.items(): + assert torch.equal(value, param_2.get(key)) + + del param_1, param_2 \ No newline at end of file
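The reproducibility test above depends on `init_reproducibility` from `fltk.nets.util`, whose implementation is not part of this diff. A helper of this kind typically pins every relevant RNG; the sketch below is an assumption about its behavior, not FLTK's actual code:

    import random

    import numpy as np
    import torch

    def init_reproducibility_sketch(seed: int = 42) -> None:
        # Pin all RNGs that influence weight initialization and data order.
        random.seed(seed)        # Python's built-in RNG
        np.random.seed(seed)     # NumPy RNG
        torch.manual_seed(seed)  # Torch CPU and CUDA RNGs
        # Trade speed for determinism in cuDNN-backed operations.
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

With pytest pinned in requirements-dev.txt, the suite can be run from the repository root with `python -m pytest tests/`.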