diff --git a/.gitignore b/.gitignore index c467fd2a..eb84f475 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,8 @@ +*.csv +*.json +*.png +*.pdf + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] @@ -133,6 +138,10 @@ venv-* data/** !data/.gitkeep output +docker_data .idea +*.tmp.txt +docker-compose.yml -logging/**/events.out.** \ No newline at end of file +refactor-notes.md +experiments/**/exps/* \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 6d542913..47a53e91 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,11 +1,14 @@ FROM ubuntu:20.04 -# Who maintains this DockerFile + MAINTAINER Jeroen Galjaard # Run build without interactive dialogue ARG DEBIAN_FRONTEND=noninteractive +# ENV GLOO_SOCKET_IFNAME=eth0 +# ENV TP_SOCKET_IFNAME=eth0 + # Define the working directory of the current Docker container WORKDIR /opt/federation-lab diff --git a/README.md b/README.md index 810621d1..76106cab 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,8 @@ Currently, it is assumed that Distributed Learning is performed (and *not* Feder extension of the project is planned to implement a `FederatedClient` that allows for a more realistic simulation of *Federated* Learning experiments. +### (Distributed Learning) + **General protocol:** 1. Client creation and spawning by the Orchestrator (using KubeFlows Pytorch-Operator) @@ -38,8 +40,30 @@ extension of the project is planned to implement a `FederatedClient` that allows * Data between clients (`WORLD_SIZE > 1`) is not shared * Hardware can be heterogeneous * The location of devices matters (network latency and bandwidth) +* Communication is performed through RPC; aggregation is performed with `AllReduce`. + +### Federated Learning +**General protocol:** + +1. Client selection by the Federator. +2. The selected clients download the model. +3. Local training on the clients for X epochs +4. Weights/gradients of the trained model are sent to the Federator +5. The Federator aggregates the weights/gradients to create a new and improved model +6. The updated model is shared with the clients +7. Repeat steps 1 to 6 until the convergence/stopping condition is met. + +**Important notes:** + +* Data is not shared between clients +* The data is non-IID +* Hardware can be heterogeneous +* The location of devices matters (network latency and bandwidth) * Communication can be costly + + + ### Overview of deployed project When deploying the system, the following diagram shows how the system operates. `PyTorchJob`s are launched by the Orchestrator (see the [Orchestrator charts](./charts/orchestrator)). The Extractor keeps track of progress (see the @@ -381,7 +405,7 @@ helm install flearner ./orchestrator --namespace test -f fltk-values.yaml ``` This will spawn an `fl-server` Pod in the `test` Namespace, which will spawn Pods (using `V1PyTorchJobs`), that -run experiments. It will currently make use of the [`configs/example_cloud_experiment.json`](configs/benchmarking/example_cloud_experiment.json) default configuration.
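The Federated Learning protocol added to the README above (steps 4–6) is realised further down in this diff by `fltk/core/federator.py`, which collects client `state_dict`s and dataset sizes and hands them to an aggregation function chosen via `get_aggregation(config.aggregation)`. The sketch below is a minimal illustration of FedAvg-style weighted averaging under that assumption; the helper name `fed_avg` and its signature are illustrative, not the project's API.

```python
# Minimal FedAvg-style aggregation sketch (illustrative; the real logic
# lives in fltk.strategy and is selected via get_aggregation()).
from typing import Dict
import torch


def fed_avg(client_weights: Dict[str, Dict[str, torch.Tensor]],
            client_sizes: Dict[str, int]) -> Dict[str, torch.Tensor]:
    """Average client state_dicts, weighted by each client's dataset size."""
    total_size = sum(client_sizes.values())
    new_params: Dict[str, torch.Tensor] = {}
    for client_name, state_dict in client_weights.items():
        weight = client_sizes[client_name] / total_size
        for key, value in state_dict.items():
            contribution = value.float() * weight
            if key in new_params:
                new_params[key] += contribution
            else:
                new_params[key] = contribution
    return new_params


if __name__ == '__main__':
    # Toy check: client1 holds 3/4 of the data, so its weights dominate.
    a = {'w': torch.ones(2)}
    b = {'w': torch.zeros(2)}
    print(fed_avg({'client1': a, 'client2': b}, {'client1': 3, 'client2': 1}))
    # -> {'w': tensor([0.7500, 0.7500])}
```

After aggregation, the Federator pushes the resulting `state_dict` back to the selected clients via `update_nn_parameters` (step 6) and repeats for `config.rounds` rounds.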
As described in the [values](./charts/orchestrator/values.yaml) file of the `Orchestrator`s Helm chart diff --git a/charts/orchestrator/values.yaml b/charts/orchestrator/values.yaml index ad94b4a1..387d34bd 100644 --- a/charts/orchestrator/values.yaml +++ b/charts/orchestrator/values.yaml @@ -1,4 +1,4 @@ orchestrator: cpu: 1000m memory: 2000000000 - configurationFile: example_cloud_experiment.json \ No newline at end of file + configurationFile: benchmarking/example_cloud_experiment.json \ No newline at end of file diff --git a/configs/example_cloud_experiment.json b/configs/example_cloud_experiment.json deleted file mode 100644 index b6055ff6..00000000 --- a/configs/example_cloud_experiment.json +++ /dev/null @@ -1,35 +0,0 @@ -{ - "cluster": { - "orchestrator": { - "wait_for_clients": true, - "service": "fl-server.test.svc.cluster.local", - "nic": "eth0" - }, - "client": { - "prefix": "client", - "tensorboard_active": false - }, - "image": "gcr.io/test-bed-distml/fltk:latest" - }, - "execution_config": { - "duration": 3600, - "experiment_prefix": "cloud_experiment", - "cuda": false, - "tensorboard": { - "active": true, - "record_dir": "logging" - }, - "net": { - "save_model": false, - "save_temp_model": false, - "save_epoch_interval": 1, - "save_model_path": "models", - "epoch_save_start_suffix": "start", - "epoch_save_end_suffix": "end" - }, - "reproducibility": { - "torch_seed": 42, - "arrival_seed": 123 - } - } -} \ No newline at end of file diff --git a/deploy/docker/stub_default.yml b/deploy/docker/stub_default.yml new file mode 100644 index 00000000..4023a178 --- /dev/null +++ b/deploy/docker/stub_default.yml @@ -0,0 +1,27 @@ +client_name: # name can be anything +# container_name: federation-lab-client2 # what the name for this container would be + cpuset: '{cpu_set}' + restart: "no" # if it crashes for example + build: . # look for the docker file where this file is currently located + volumes: + - ./data:/opt/federation-lab/data +# - ./docker_data:/opt/federation-lab/data + - ./default_models:/opt/federation-lab/default_models + - ./data_loaders:/opt/federation-lab/data_loaders + - ./fltk:/opt/federation-lab/fltk + environment: + - PYTHONUNBUFFERED=1 + - RANK={rank} + - WORLD_SIZE={world_size} + - EXP_CONFIG=${EXP_CONFIG_FILE} + - MASTER_HOSTNAME=10.5.0.11 + - NIC=eth0 + - OPTIONAL_PARAMS=${OPTIONAL_PARAMS} + ports: + - "5002:5000" # {machine-port}:{docker-port} + depends_on: + - "fl_server" + deploy: + resources: + limits: + cpus: '{num_cpus}' \ No newline at end of file diff --git a/deploy/docker/system_stub.yml b/deploy/docker/system_stub.yml new file mode 100644 index 00000000..01270c8c --- /dev/null +++ b/deploy/docker/system_stub.yml @@ -0,0 +1,30 @@ +# creating a multi-container docker +version: "3.3" +services: + fl_server: # name can be anything + container_name: federation-lab-server # what the name for this container would be + cpuset: '0-2' + restart: "no" # if it crashes for example + build: . 
# look for the docker file where this file is currently located + volumes: +# - ./data/MNIST:/opt/federation-lab/data/MNIST + - ./data:/opt/federation-lab/data + - ./output:/opt/federation-lab/output + - ./fltk:/opt/federation-lab/fltk + environment: + - PYTHONUNBUFFERED=1 + - RANK=0 + - WORLD_SIZE={world_size} + - EXP_CONFIG=${EXP_CONFIG_FILE} + - MASTER_HOSTNAME=10.5.0.11 + - NIC=eth0 + - OPTIONAL_PARAMS=${OPTIONAL_PARAMS} + ports: + - "5000:5000" # {machine-port}:{docker-port} + networks: + default: + ipv4_address: 10.5.0.11 +networks: + default: + external: + name: local_network_dev \ No newline at end of file diff --git a/examples/README.md b/examples/README.md new file mode 100644 index 00000000..d3f2afac --- /dev/null +++ b/examples/README.md @@ -0,0 +1 @@ +These examples are outdated! \ No newline at end of file diff --git a/experiments/example_cuda/descr.yaml b/experiments/example_cuda/descr.yaml new file mode 100644 index 00000000..b42db5f4 --- /dev/null +++ b/experiments/example_cuda/descr.yaml @@ -0,0 +1,25 @@ +--- +# Experiment configuration +total_epochs: 3 +rounds: 5 +epochs_per_cycle: 1 +wait_for_clients: true +net: MNISTCNN +dataset: mnist +# Use cuda is available; setting to false will force CPU +cuda: true +profiling_time: 100 +warmup_round: false +output_location: 'output/example_cuda' +tensor_board_active: true +clients_per_round: 2 +node_groups: + slow: [1, 1] + medium: [2, 2] + fast: [3, 3] +sampler: "uniform" # "limit labels" || "q sampler" || "dirichlet" || "uniform" (default) +sampler_args: + - 0.07 # label limit || q probability || alpha || unused + - 42 # random seed || random seed || random seed || unused +num_clients: 10 +replications: 5 diff --git a/experiments/example_cuda/fedavg.cfg.yaml b/experiments/example_cuda/fedavg.cfg.yaml new file mode 100644 index 00000000..25a64bda --- /dev/null +++ b/experiments/example_cuda/fedavg.cfg.yaml @@ -0,0 +1,5 @@ +# Individual configuration +offload_stategy: vanilla +deadline: 500 +single_machine: true +real_time: false \ No newline at end of file diff --git a/experiments/example_docker/descr.yaml b/experiments/example_docker/descr.yaml new file mode 100644 index 00000000..b1a7aaa2 --- /dev/null +++ b/experiments/example_docker/descr.yaml @@ -0,0 +1,48 @@ +--- +# Experiment configuration +total_epochs: 3 +rounds: 5 +epochs_per_cycle: 1 +wait_for_clients: true +net: MNISTCNN +dataset: mnist +# Use cuda is available; setting to false will force CPU +cuda: false +profiling_time: 100 +warmup_round: false +output_location: 'output/example_docker' +tensor_board_active: true +clients_per_round: 2 +node_groups: + slow: [1, 1] + medium: [2, 2] + fast: [3, 3] +sampler: "uniform" # "limit labels" || "q sampler" || "dirichlet" || "uniform" (default) +sampler_args: + - 0.07 # label limit || q probability || alpha || unused + - 42 # random seed || random seed || random seed || unused +num_clients: 2 +replications: 2 +deploy: + docker: + base_path: deploy/docker + federator: + stub-name: system_stub.yml + pin-cores: true + num-cores: 1 + clients: + fast: + stub-name: stub_default.yml + amount: 2 + pin-cores: true + num-cores: 1 + cpu-speed: 1 + cpu-variation: 0 + slow: + stub-name: stub_default.yml + amount: 0 + pin-cores: true + num-cores: 1 + cpu-speed: 1 + cpu-variation: 0 + diff --git a/experiments/example_docker/fedavg.cfg.yaml b/experiments/example_docker/fedavg.cfg.yaml new file mode 100644 index 00000000..17bd81b1 --- /dev/null +++ b/experiments/example_docker/fedavg.cfg.yaml @@ -0,0 +1,5 @@ +# Individual configuration 
+offload_stategy: vanilla +deadline: 500 +single_machine: false +real_time: true \ No newline at end of file diff --git a/experiments/example_native/descr.yaml b/experiments/example_native/descr.yaml new file mode 100644 index 00000000..c254640b --- /dev/null +++ b/experiments/example_native/descr.yaml @@ -0,0 +1,25 @@ +--- +# Experiment configuration +total_epochs: 3 +rounds: 5 +epochs_per_cycle: 1 +wait_for_clients: true +net: MNISTCNN +dataset: mnist +# Use cuda is available; setting to false will force CPU +cuda: false +profiling_time: 100 +warmup_round: false +output_location: 'output/example_native' +tensor_board_active: true +clients_per_round: 2 +node_groups: + slow: [1, 1] + medium: [2, 2] + fast: [3, 3] +sampler: "uniform" # "limit labels" || "q sampler" || "dirichlet" || "uniform" (default) +sampler_args: + - 0.07 # label limit || q probability || alpha || unused + - 42 # random seed || random seed || random seed || unused +num_clients: 10 +replications: 5 diff --git a/experiments/example_native/fedavg.cfg.yaml b/experiments/example_native/fedavg.cfg.yaml new file mode 100644 index 00000000..25a64bda --- /dev/null +++ b/experiments/example_native/fedavg.cfg.yaml @@ -0,0 +1,5 @@ +# Individual configuration +offload_stategy: vanilla +deadline: 500 +single_machine: true +real_time: false \ No newline at end of file diff --git a/fltk/__init__.py b/fltk/__init__.py index aa6e546b..92989687 100644 --- a/fltk/__init__.py +++ b/fltk/__init__.py @@ -1,2 +1,2 @@ -__version__ = '0.3.1' \ No newline at end of file +__version__ = '0.4.0' \ No newline at end of file diff --git a/fltk/__main__.py b/fltk/__main__.py index c6332429..55b4d1f0 100644 --- a/fltk/__main__.py +++ b/fltk/__main__.py @@ -1,49 +1,68 @@ +import argparse import json import logging -from argparse import Namespace, ArgumentParser +import os +from argparse import Namespace from pathlib import Path +from torch.distributed import rpc + +from fltk.core.client import Client +from fltk.core.federator import Federator from fltk.launch import launch_client, launch_orchestrator, launch_extractor -from fltk.util.config.arguments import create_client_parser, create_cluster_parser, extract_learning_parameters, \ - create_extractor_parser -from fltk.util.config.base_config import BareConfig +from fltk.util.config import DistributedConfig, Config +from fltk.util.config.arguments import create_all_subparsers, extract_learning_parameters +from fltk.util.generate_experiments import generate, run def __main__(): - parser = ArgumentParser(description='Experiment launcher for the Federated Learning Testbed') - subparsers = parser.add_subparsers(dest="mode") - create_client_parser(subparsers) - create_cluster_parser(subparsers) - create_extractor_parser(subparsers) + parser = argparse.ArgumentParser(prog='fltk', + description='Experiment launcher for the Federated Learning Testbed (fltk)') + subparsers = parser.add_subparsers(dest="action", required=True) + create_all_subparsers(subparsers) """ To create your own parser mirror the construction in the 'client_parser' object. Or refer to the ArgumentParser library documentation. 
""" - arguments = parser.parse_args() + args = parser.parse_args() - with open(arguments.config, 'r') as config_file: - config: BareConfig = BareConfig.from_dict(json.load(config_file)) - config.config_path = Path(arguments.config) + with open(args.config, 'r') as config_file: + config: DistributedConfig = DistributedConfig.from_dict(json.load(config_file)) + config.config_path = Path(args.config) - if arguments.mode == 'cluster': + if args.action == 'util-docker': + raise NotImplementedError(args.action) + elif args.action == 'util-generate': + path = Path(args.path) + print(f'generate for {path}') + generate(path) + elif args.action == 'util-run': + run(Path(args.path)) + elif args.action == 'remote': + run_remote(Path(args.config), args.rank, parser, args.nic, args.host, args.prefix) + elif args.action == 'single': + # Run single machine mode + run_single(Path(args.config), args.prefix) + # TODO: Take out orchestrator and put in seperate file. + # Run with DistributedDataParallel (i.e. speed-up execution). + elif args.action == 'cluster': logging.info("Starting in cluster mode.") - cluster_start(arguments, config) - elif arguments.mode == 'client': + cluster_start(args, config) + elif args.action == 'client': logging.info("Starting in client mode") - client_start(arguments, config) + client_start(args, config) logging.info("Stopping client...") exit(0) - elif arguments.mode == 'extractor': - launch_extractor(arguments, config) + elif args.action == 'extractor': + launch_extractor(args, config) else: - print("Provided mode is not supported...") - exit(1) + raise NotImplementedError(args.action) -def cluster_start(args: Namespace, configuration: BareConfig): +def cluster_start(args: Namespace, configuration: DistributedConfig): """ - Function to to launch Orchestrator for execution with provided configurations. Currently + Function to launch Orchestrator for execution with provided configurations. Currently this assumes that a single Orchestrator is started that manages all the resources in the cluster. """ logging.basicConfig(level=logging.DEBUG, @@ -54,7 +73,7 @@ def cluster_start(args: Namespace, configuration: BareConfig): launch_orchestrator(args=args, conf=configuration) -def client_start(args: Namespace, configuration: BareConfig): +def client_start(args: Namespace, configuration: DistributedConfig): learning_params = extract_learning_parameters(args) # Set the seed for PyTorch, numpy seed is mostly ignored. Set the `torch_seed` to a different value # for each repetition that you want to run an experiment with. @@ -63,10 +82,85 @@ def client_start(args: Namespace, configuration: BareConfig): launch_client(task_id, config=configuration, learning_params=learning_params, namespace=args) +def run_single(config_path: Path, prefix: str = None): + # We can iterate over all the experiments in the directory and execute it, as long as the system remains the same! 
+ # System = machines and its configuration + print(config_path) + config = Config.FromYamlFile(config_path) + config.world_size = config.num_clients + 1 + config.replication_id = prefix + federator_node = Federator('federator', 0, config.world_size, config) + federator_node.run() + + +def retrieve_env_params(nic=None, host=None): + if host: + os.environ['MASTER_ADDR'] = host + os.environ['MASTER_PORT'] = '5000' + if nic: + os.environ['GLOO_SOCKET_IFNAME'] = nic + os.environ['TP_SOCKET_IFNAME'] = nic + + +def retrieve_network_params_from_config(config: Config, nic=None, host=None): + if hasattr(config, 'system'): + system_attr = getattr(config, 'system') + if 'federator' in system_attr: + if 'hostname' in system_attr['federator'] and not host: + host = system_attr['federator']['hostname'] + if 'nic' in system_attr['federator'] and not nic: + nic = system_attr['federator']['nic'] + return nic, host + + +def run_remote(config_path: Path, rank: int, parser, nic=None, host=None, prefix: str = None): + print(config_path, rank) + config = Config.FromYamlFile(config_path) + config.world_size = config.num_clients + 1 + config.replication_id = prefix + nic, host = retrieve_network_params_from_config(config, nic, host) + if not nic or not host: + print('Missing rank, host, world-size, or nic argument when in \'remote\' mode!') + parser.print_help() + exit(1) + retrieve_env_params(nic, host) + print(f'Starting with host={os.environ["MASTER_ADDR"]} and port={os.environ["MASTER_PORT"]} and interface={nic}') + options = rpc.TensorPipeRpcBackendOptions( + num_worker_threads=16, + rpc_timeout=0, # infinite timeout + # init_method=f'tcp://{os.environ["MASTER_ADDR"]}:{os.environ["MASTER_PORT"]}' + init_method='env://', + _transports=["uv"] + ) + if rank != 0: + print(f'Starting worker {rank} with world size={config.world_size}') + rpc.init_rpc( + f"client{rank}", + rank=rank, + world_size=config.world_size, + rpc_backend_options=options, + ) + client_node = Client(f'client{rank}', rank, config.world_size, config) + client_node.remote_registration() + else: + print(f'Starting the ps with world size={config.world_size}') + rpc.init_rpc( + "federator", + rank=rank, + world_size=config.world_size, + rpc_backend_options=options + + ) + federator_node = Federator('federator', 0, config.world_size, config) + federator_node.run() + federator_node.stop_all_clients() + print('Ending program') + + if __name__ == "__main__": root = logging.getLogger() if root.handlers: for handler in root.handlers: root.removeHandler(handler) - logging.basicConfig(format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s', datefmt='%m-%d-%Y %H:%M:%S',) + logging.basicConfig(format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s', datefmt='%m-%d-%Y %H:%M:%S', ) __main__() diff --git a/fltk/core/__init__.py b/fltk/core/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/fltk/core/client.py b/fltk/core/client.py new file mode 100644 index 00000000..8141411d --- /dev/null +++ b/fltk/core/client.py @@ -0,0 +1,146 @@ +import time +from typing import Tuple, Any + +import torch + +from fltk.core.node import Node +from fltk.schedulers import MinCapableStepLR +from fltk.strategy import get_optimizer +from fltk.util.config import Config + + +class Client(Node): + running = False + + def __init__(self, id: str, rank: int, world_size: int, config: Config): + super().__init__(id, rank, world_size, config) + + self.loss_function = self.config.get_loss_function()() + self.optimizer = 
get_optimizer(self.config.optimizer)(self.net.parameters(), + **self.config.optimizer_args) + self.scheduler = MinCapableStepLR(self.logger, self.optimizer, + self.config.scheduler_step_size, + self.config.scheduler_gamma, + self.config.min_lr) + + def remote_registration(self): + self.logger.info('Sending registration') + self.message('federator', 'ping', 'new_sender', be_weird=True) + self.message('federator', 'register_client', self.id, self.rank) + self.running = True + self._event_loop() + + def stop_client(self): + self.logger.info('Got call to stop event loop') + self.running = False + + def _event_loop(self): + self.logger.info('Starting event loop') + while self.running: + time.sleep(0.1) + self.logger.info('Exiting node') + + def train(self, num_epochs: int): + start_time = time.time() + + running_loss = 0.0 + final_running_loss = 0.0 + if self.distributed: + self.dataset.train_sampler.set_epoch(num_epochs) + + number_of_training_samples = len(self.dataset.get_train_loader()) + # self.logger.info(f'{self.id}: Number of training samples: {number_of_training_samples}') + + for i, (inputs, labels) in enumerate(self.dataset.get_train_loader(), 0): + inputs, labels = inputs.to(self.device), labels.to(self.device) + + # zero the parameter gradients + self.optimizer.zero_grad() + + outputs = self.net(inputs) + loss = self.loss_function(outputs, labels) + + loss.backward() + self.optimizer.step() + running_loss += loss.item() + # Mark logging update step + if i % self.config.log_interval == 0: + self.logger.info( + '[%s] [%d, %5d] loss: %.3f' % (self.id, num_epochs, i, running_loss / self.config.log_interval)) + final_running_loss = running_loss / self.config.log_interval + running_loss = 0.0 + # break + + end_time = time.time() + duration = end_time - start_time + # self.logger.info(f'Train duration is {duration} seconds') + + return final_running_loss, self.get_nn_parameters(), + + def set_tau_eff(self, total): + client_weight = self.get_client_datasize() / total + n = self.get_client_datasize() + E = self.config.epochs + B = 16 # nicely hardcoded :) + tau_eff = int(E * n / B) * client_weight + if hasattr(self.optimizer, 'set_tau_eff'): + self.optimizer.set_tau_eff(tau_eff) + + def test(self): + start_time = time.time() + correct = 0 + total = 0 + targets_ = [] + pred_ = [] + loss = 0.0 + with torch.no_grad(): + for (images, labels) in self.dataset.get_test_loader(): + images, labels = images.to(self.device), labels.to(self.device) + + outputs = self.net(images) + + _, predicted = torch.max(outputs.data, 1) + total += labels.size(0) + correct += (predicted == labels).sum().item() + + targets_.extend(labels.cpu().view_as(predicted).numpy()) + pred_.extend(predicted.cpu().numpy()) + + loss += self.loss_function(outputs, labels).item() + loss /= len(self.dataset.get_test_loader().dataset) + accuracy = 100.0 * correct / total + # confusion_mat = confusion_matrix(targets_, pred_) + # accuracy_per_class = confusion_mat.diagonal() / confusion_mat.sum(1) + # + # class_precision = calculate_class_precision(confusion_mat) + # class_recall = calculate_class_recall(confusion_mat) + end_time = time.time() + duration = end_time - start_time + # self.logger.info(f'Test duration is {duration} seconds') + return accuracy, loss + + def get_client_datasize(self): + return len(self.dataset.get_train_sampler()) + + def exec_round(self, num_epochs: int) -> Tuple[Any, Any, Any, Any, float, float, float]: + + start = time.time() + + loss, weights = self.train(num_epochs) + time_mark_between = time.time() + 
accuracy, test_loss = self.test() + + end = time.time() + round_duration = end - start + train_duration = time_mark_between - start + test_duration = end - time_mark_between + # self.logger.info(f'Round duration is {duration} seconds') + + if hasattr(self.optimizer, 'pre_communicate'): # aka fednova or fedprox + self.optimizer.pre_communicate() + for k, v in weights.items(): + weights[k] = v.cpu() + return loss, weights, accuracy, test_loss, round_duration, train_duration, test_duration + + def __del__(self): + self.logger.info(f'Client {self.id} is stopping') \ No newline at end of file diff --git a/fltk/core/distributed/__init__.py b/fltk/core/distributed/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/fltk/client.py b/fltk/core/distributed/client.py similarity index 97% rename from fltk/client.py rename to fltk/core/distributed/client.py index 0a1f7c9d..0ec20f22 100644 --- a/fltk/client.py +++ b/fltk/core/distributed/client.py @@ -9,16 +9,20 @@ from sklearn.metrics import confusion_matrix from torch.utils.tensorboard import SummaryWriter +from fltk.core.distributed.dist_node import DistNode from fltk.nets.util import calculate_class_precision, calculate_class_recall, save_model, load_model_from_file from fltk.schedulers import MinCapableStepLR, LearningScheduler from fltk.util.config.arguments import LearningParameters -from fltk.util.config.base_config import BareConfig +from fltk.util.config import DistributedConfig from fltk.util.results import EpochData -class Client(object): +class Client(DistNode): + """ + TODO: Combine with Client and differentiate between Federated and Distributed Learnign through better inheritance. + """ - def __init__(self, rank: int, task_id: str, world_size: int, config: BareConfig = None, + def __init__(self, rank: int, task_id: str, world_size: int, config: DistributedConfig = None, learning_params: LearningParameters = None): """ @param rank: PyTorch rank provided by KubeFlow setup. diff --git a/fltk/core/distributed/dist_node.py b/fltk/core/distributed/dist_node.py new file mode 100644 index 00000000..2f9cfe59 --- /dev/null +++ b/fltk/core/distributed/dist_node.py @@ -0,0 +1,5 @@ +import abc + + +class DistNode(abc.ABC): + pass \ No newline at end of file diff --git a/fltk/extractor.py b/fltk/core/distributed/extractor.py similarity index 88% rename from fltk/extractor.py rename to fltk/core/distributed/extractor.py index a0303b94..c6c0ac59 100644 --- a/fltk/extractor.py +++ b/fltk/core/distributed/extractor.py @@ -3,10 +3,10 @@ from torchvision.datasets import FashionMNIST, CIFAR10, CIFAR100, MNIST -from fltk.util.config import BareConfig +from fltk.util.config import DistributedConfig -def download_datasets(args: Namespace, config: BareConfig): +def download_datasets(args: Namespace, config: DistributedConfig): """ Function to Download datasets to a system. This is currently meant to be run (using the extractor mode of FLTK) to download all datasets into the `data` directory and include it in the Docker image that is build for the project. @@ -15,7 +15,7 @@ def download_datasets(args: Namespace, config: BareConfig): @param args: Namespace object. @type args: Namespace @param config: FLTK configuration file, for finding the path where the datasets should be stored. 
- @type config: BareConfig + @type config: DistributedConfig @return: None @rtype: None """ diff --git a/fltk/orchestrator.py b/fltk/core/distributed/orchestrator.py similarity index 96% rename from fltk/orchestrator.py rename to fltk/core/distributed/orchestrator.py index 174626a1..ef4bf07c 100644 --- a/fltk/orchestrator.py +++ b/fltk/core/distributed/orchestrator.py @@ -8,13 +8,15 @@ from kubeflow.pytorchjob.constants.constants import PYTORCHJOB_GROUP, PYTORCHJOB_VERSION, PYTORCHJOB_PLURAL from kubernetes import client +from fltk.core.distributed.dist_node import DistNode from fltk.util.cluster.client import construct_job, ClusterManager -from fltk.util.config.base_config import BareConfig + +from fltk.util.config import DistributedConfig from fltk.util.task.generator.arrival_generator import ArrivalGenerator, Arrival from fltk.util.task.task import ArrivalTask -class Orchestrator(object): +class Orchestrator(DistNode): """ Central component of the Federated Learning System: The Orchestrator @@ -36,7 +38,7 @@ class Orchestrator(object): deployed_tasks: List[ArrivalTask] = [] completed_tasks: List[str] = [] - def __init__(self, cluster_mgr: ClusterManager, arv_gen: ArrivalGenerator, config: BareConfig): + def __init__(self, cluster_mgr: ClusterManager, arv_gen: ArrivalGenerator, config: DistributedConfig): self.__logger = logging.getLogger('Orchestrator') self.__logger.debug("Loading in-cluster configuration") self.__cluster_mgr = cluster_mgr diff --git a/fltk/core/federator.py b/fltk/core/federator.py new file mode 100644 index 00000000..7faa8578 --- /dev/null +++ b/fltk/core/federator.py @@ -0,0 +1,251 @@ +import copy +import time +from pathlib import Path +from typing import List, Union + +import torch +from tqdm import tqdm + +from fltk.core.client import Client +from fltk.core.node import Node +from fltk.datasets.loader_util import get_dataset +from fltk.strategy import FedAvg, random_selection, average_nn_parameters, average_nn_parameters_simple +from fltk.util.config import Config +from dataclasses import dataclass + +from fltk.util.data_container import DataContainer, FederatorRecord, ClientRecord +from fltk.strategy import get_aggregation + +NodeReference = Union[Node, str] +@dataclass +class LocalClient: + name: str + ref: NodeReference + data_size: int + exp_data: DataContainer + + +def cb_factory(future: torch.Future, method, *args, **kwargs): + future.then(lambda x: method(x, *args, **kwargs)) + + +class Federator(Node): + clients: List[LocalClient] = [] + # clients: List[NodeReference] = [] + num_rounds: int + exp_data: DataContainer + + def __init__(self, id: str, rank: int, world_size: int, config: Config): + super().__init__(id, rank, world_size, config) + self.loss_function = self.config.get_loss_function()() + self.num_rounds = config.rounds + self.config = config + prefix_text = '' + if config.replication_id: + prefix_text = f'_r{config.replication_id}' + config.output_path = Path(config.output_path) / f'{config.experiment_prefix}{prefix_text}' + self.exp_data = DataContainer('federator', config.output_path, FederatorRecord, config.save_data_append) + self.aggregation_method = get_aggregation(config.aggregation) + + + + def create_clients(self): + self.logger.info('Creating clients') + if self.config.single_machine: + # Create direct clients + world_size = self.config.num_clients + 1 + for client_id in range(1, self.config.num_clients+ 1): + client_name = f'client{client_id}' + client = Client(client_name, client_id, world_size, copy.deepcopy(self.config)) + 
self.clients.append(LocalClient(client_name, client, 0, DataContainer(client_name, self.config.output_path, + ClientRecord, self.config.save_data_append))) + self.logger.info(f'Client "{client_name}" created') + + def register_client(self, client_name, rank): + self.logger.info(f'Got new client registration from client {client_name}') + if self.config.single_machine: + self.logger.warning('This function should not be called when in single machine mode!') + self.clients.append(LocalClient(client_name, client_name, rank, DataContainer(client_name, self.config.output_path, + ClientRecord, self.config.save_data_append))) + + def stop_all_clients(self): + for client in self.clients: + self.message(client.ref, Client.stop_client) + + + def _num_clients_online(self) -> int: + return len(self.clients) + + def _all_clients_online(self) -> bool: + return len(self.clients) == self.world_size - 1 + + def clients_ready(self): + """ + Synchronous implementation + """ + all_ready = False + ready_clients = [] + while not all_ready: + responses = [] + all_ready = True + for client in self.clients: + resp = self.message(client.ref, Client.is_ready) + if resp: + self.logger.info(f'Client {client} is ready') + else: + self.logger.info(f'Waiting for client {client}') + all_ready = False + time.sleep(2) + + def get_client_data_sizes(self): + for client in self.clients: + client.data_size = self.message(client.ref, Client.get_client_datasize) + + def run(self): + # Load dataset with world size 2 to load the whole dataset. + # Caused by the fact that the dataloader subtracts 1 from the world size to exclude the federator by default. + self.init_dataloader(world_size=2) + + self.create_clients() + while not self._all_clients_online(): + self.logger.info(f'Waiting for all clients to come online. 
Waiting for {self.world_size - 1 -self._num_clients_online()} clients') + time.sleep(2) + self.logger.info('All clients are online') + # self.logger.info('Running') + # time.sleep(10) + self.client_load_data() + self.get_client_data_sizes() + self.clients_ready() + # self.logger.info('Sleeping before starting communication') + # time.sleep(20) + for communication_round in range(self.config.rounds): + self.exec_round(communication_round) + + self.save_data() + self.logger.info('Federator is stopping') + + + def save_data(self): + self.exp_data.save() + for client in self.clients: + client.exp_data.save() + + def client_load_data(self): + for client in self.clients: + self.message(client.ref, Client.init_dataloader) + + def set_tau_eff(self): + total = sum(client.data_size for client in self.clients) + # responses = [] + for client in self.clients: + self.message(client.ref, Client.set_tau_eff, client.ref, total) + # responses.append((client, _remote_method_async(Client.set_tau_eff, client.ref, total))) + # torch.futures.wait_all([x[1] for x in responses]) + + def test(self, net): + start_time = time.time() + correct = 0 + total = 0 + targets_ = [] + pred_ = [] + loss = 0.0 + with torch.no_grad(): + for (images, labels) in self.dataset.get_test_loader(): + images, labels = images.to(self.device), labels.to(self.device) + + outputs = net(images) + + _, predicted = torch.max(outputs.data, 1) + total += labels.size(0) + correct += (predicted == labels).sum().item() + + targets_.extend(labels.cpu().view_as(predicted).numpy()) + pred_.extend(predicted.cpu().numpy()) + + loss += self.loss_function(outputs, labels).item() + loss /= len(self.dataset.get_test_loader().dataset) + accuracy = 100.0 * correct / total + # confusion_mat = confusion_matrix(targets_, pred_) + # accuracy_per_class = confusion_mat.diagonal() / confusion_mat.sum(1) + # + # class_precision = calculate_class_precision(confusion_mat) + # class_recall = calculate_class_recall(confusion_mat) + end_time = time.time() + duration = end_time - start_time + self.logger.info(f'Test duration is {duration} seconds') + return accuracy, loss + + def exec_round(self, id: int): + start_time = time.time() + num_epochs = self.config.epochs + + # Client selection + selected_clients: List[LocalClient] + selected_clients = random_selection(self.clients, self.config.clients_per_round) + + last_model = self.get_nn_parameters() + for client in selected_clients: + self.message(client.ref, Client.update_nn_parameters, last_model) + + # Actual training calls + client_weights = {} + client_sizes = {} + # pbar = tqdm(selected_clients) + # for client in pbar: + + # Client training + training_futures: List[torch.Future] = [] + + + # def cb_factory(future: torch.Future, method, client, client_weights, client_sizes, num_epochs, name): + # future.then(lambda x: method(x, client, client_weights, client_sizes, num_epochs, client.name)) + + def training_cb(fut: torch.Future, client_ref: LocalClient, client_weights, client_sizes, num_epochs): + train_loss, weights, accuracy, test_loss, round_duration, train_duration, test_duration = fut.wait() + self.logger.info(f'Training callback for client {client_ref.name} with accuracy={accuracy}') + client_weights[client_ref.name] = weights + client_data_size = self.message(client_ref.ref, Client.get_client_datasize) + client_sizes[client_ref.name] = client_data_size + client_ref.exp_data.append( + ClientRecord(id, train_duration, test_duration, round_duration, num_epochs, 0, accuracy, train_loss, + test_loss)) + + for client 
in selected_clients: + future = self.message_async(client.ref, Client.exec_round, num_epochs) + cb_factory(future, training_cb, client, client_weights, client_sizes, num_epochs) + self.logger.info(f'Request sent to client {client.name}') + training_futures.append(future) + + def all_futures_done(futures: List[torch.Future])->bool: + return all(map(lambda x: x.done(), futures)) + + while not all_futures_done(training_futures): + time.sleep(0.1) + self.logger.info('') + # self.logger.info(f'Waiting for other clients') + + self.logger.info(f'Continue with rest [1]') + time.sleep(3) + + # for client in selected_clients: + # # pbar.set_description(f'[Round {id:>3}] Running clients') + # train_loss, weights, accuracy, test_loss, round_duration, train_duration, test_duration = self.message(client.ref, Client.exec_round, num_epochs) + # client_weights[client.name] = weights + # client_data_size = self.message(client.ref, Client.get_client_datasize) + # client_sizes[client.name] = client_data_size + # client.exp_data.append(ClientRecord(id, train_duration, test_duration, round_duration, num_epochs, 0, accuracy, train_loss, test_loss)) + # # self.logger.info(f'[Round {id:>3}] Client {client} has a accuracy of {accuracy}, train loss={train_loss}, test loss={test_loss},datasize={client_data_size}') + + # updated_model = FedAvg(client_weights, client_sizes) + updated_model = self.aggregation_method(client_weights, client_sizes) + # updated_model = average_nn_parameters_simple(list(client_weights.values())) + self.update_nn_parameters(updated_model) + + test_accuracy, test_loss = self.test(self.net) + self.logger.info(f'[Round {id:>3}] Federator has a accuracy of {test_accuracy} and loss={test_loss}') + + end_time = time.time() + duration = end_time - start_time + self.exp_data.append(FederatorRecord(len(selected_clients), id, duration, test_loss, test_accuracy)) + self.logger.info(f'[Round {id:>3}] Round duration is {duration} seconds') + diff --git a/fltk/core/node.py b/fltk/core/node.py new file mode 100644 index 00000000..a554a5b5 --- /dev/null +++ b/fltk/core/node.py @@ -0,0 +1,199 @@ +import copy +import os +from typing import Callable, Any + +import torch + +# from fltk.core.rpc_util import _remote_method_direct +from torch.distributed import rpc + +from fltk.datasets.loader_util import get_dataset +from fltk.nets import get_net +from fltk.util.config import Config +from fltk.util.log import getLogger + +global_vars = {} + + +def _remote_method_direct(method, other_node: str, *args, **kwargs): + # Client example + # ret = rpc.rpc_async(self.client_to_offload_to, Client.offload_receive_endpoint, args=([model_weights, i, self.id, local_updates_left])) + + args = [method, other_node] + list(args) + # return rpc.rpc_sync(other_node, _call_method, args=args, kwargs=kwargs) + return rpc.rpc_sync(other_node, method, args=args, kwargs=kwargs) + +class Node: + id: str + rank: int + world_size: int + counter = 0 + real_time = False + distributed = True + cuda = False + finished_init: bool = False + + device = torch.device("cpu") + net: Any + dataset: Any + logger = getLogger(__name__) + + + # _address_book = {} + + def __init__(self, id: str, rank: int, world_size: int, config: Config): + self.config = config + self.id = id + self.rank = rank + self.world_size = world_size + self.real_time = config.real_time + global global_vars + global_vars['self'] = self + self._config(config) + + def _config(self, config: Config): + self.logger.setLevel(config.log_level.value) + self.config.rank = self.rank + 
self.config.world_size = self.world_size + self.cuda = config.cuda + self.device = self.init_device() + self.distributed = config.distributed + self.set_net(self.load_default_model()) + + def init_dataloader(self, world_size: int = None): + config = copy.deepcopy(self.config) + if world_size: + config.world_size = world_size + self.logger.info(f'world size = {config.world_size} with rank={config.rank}') + self.dataset = get_dataset(config.dataset_name)(config) + self.finished_init = True + self.logger.info('Done with init') + + def is_ready(self): + return self.finished_init + + # def _add_address(self, node_name: str, ref: Any): + # self._address_book[node_name] = ref + + @staticmethod + def _receive(method: Callable, sender: str, *args, **kwargs): + global global_vars + # print('_receive') + # print(global_vars) + global_self = global_vars['self'] + # print(type(method)) + # print(type(global_self)) + if type(method) is str: + # print(f'Retrieving method from string: "{method}"') + method = getattr(global_self, method) + return method(*args, **kwargs) + else: + # print(method) + # print(global_self, *args, kwargs) + return method(global_self, *args, **kwargs) + + # def _lookup_reference(self, node_name: str): + + def init_device(self): + if self.cuda and not torch.cuda.is_available(): + self.logger.warning('Unable to configure device for GPU because cuda.is_available() == False') + if self.cuda and torch.cuda.is_available(): + self.logger.info("Configure device for GPU (Cuda)") + return torch.device("cuda:0") + else: + self.logger.info("Configure device for CPU") + return torch.device("cpu") + + def set_net(self, net): + self.net = net + self.net.to(self.device) + + # def load_model_from_file(self): + # model_class = self.args.get_net() + # default_model_path = os.path.join(self.args.get_default_model_folder_path(), model_class.__name__ + ".model") + # return self.load_model_from_file(default_model_path) + + def get_nn_parameters(self): + """ + Return the NN's parameters. + """ + return self.net.state_dict() + + def load_default_model(self): + """ + Load a model from default model file. + + This is used to ensure consistent default model behavior. + """ + model_class = get_net(self.config.net_name) + default_model_path = os.path.join(self.config.get_default_model_folder_path(), model_class.__name__ + ".model") + + return self.load_model_from_file(default_model_path) + + def load_model_from_file(self, model_file_path): + """ + Load a model from a file. + + :param model_file_path: string + """ + model_class = get_net(self.config.net_name) + model = model_class() + + if os.path.exists(model_file_path): + try: + model.load_state_dict(torch.load(model_file_path)) + except: + self.logger.warning("Couldn't load model. Attempting to map CUDA tensors to CPU to solve error.") + + model.load_state_dict(torch.load(model_file_path, map_location=torch.device('cpu'))) + else: + self.logger.warning("Could not find model: {}".format(model_file_path)) + return model + + + def update_nn_parameters(self, new_params, is_offloaded_model = False): + """ + Update the NN's parameters. 
+ + :param new_params: New weights for the neural network + :type new_params: dict + """ + if is_offloaded_model: + pass + # self.offloaded_net.load_state_dict(copy.deepcopy(new_params), strict=True) + else: + self.net.load_state_dict(copy.deepcopy(new_params), strict=True) + # self.logger.info(f'Weights of the model are updated') + + def message(self, other_node: str, method: Callable, *args, **kwargs) -> torch.Future: + if self.real_time: + func = Node._receive + args_list = [method, self.id] + list(args) + return rpc.rpc_sync(other_node, func, args=args_list, kwargs=kwargs) + return method(other_node, *args, **kwargs) + + def message_async(self, other_node: str, method: Callable, *args, **kwargs) -> torch.Future: + if self.real_time: + func = Node._receive + args_list = [method, self.id] + list(args) + return rpc.rpc_async(other_node, func, args=args_list, kwargs=kwargs) + # Wrap inside a future to keep the logic the same + future = torch.futures.Future() + future.set_result(method(other_node, *args, **kwargs)) + return future + + # def register_client(self, client_name, rank): + # print(f'self={self}') + # self.logger.info(f'[Default Implementation!] Got new client registration from client {client_name}') + + def ping(self, sender: str, be_weird=False): + self.logger.info(f'Pong from {self.id}, got call from {sender} [{self.counter}]') + # print(f'Pong from {self.id}, got call from {sender} [{self.counter}]') + self.counter += 1 + if be_weird: + return 'AAAAAAAAAAAAAAAAAAAAAAHHHH!!!!' + else: + return f'Pong {self.counter}' + + def __repr__(self): + return str(self.id) diff --git a/fltk/core/rpc_util.py b/fltk/core/rpc_util.py new file mode 100644 index 00000000..b8aab507 --- /dev/null +++ b/fltk/core/rpc_util.py @@ -0,0 +1,30 @@ +import torch +from torch.distributed import rpc + +def _call_method(method, rref, *args, **kwargs): + """helper for _remote_method()""" + return method(rref.local_value(), *args, **kwargs) + +def _remote_method(method, rref, *args, **kwargs): + """ + executes method(*args, **kwargs) on the from the machine that owns rref + + very similar to rref.remote().method(*args, **kwargs), but method() doesn't have to be in the remote scope + """ + args = [method, rref] + list(args) + return rpc.rpc_sync(rref.owner(), _call_method, args=args, kwargs=kwargs) + + +def _remote_method_async(method, rref, *args, **kwargs) -> torch.Future: + args = [method, rref] + list(args) + return rpc.rpc_async(rref.owner(), _call_method, args=args, kwargs=kwargs) + + +def _remote_method_async_by_info(method, worker_info, *args, **kwargs): + args = [method, worker_info] + list(args) + return rpc.rpc_async(worker_info, _call_method, args=args, kwargs=kwargs) + +def _remote_method_direct(method, other_node: str, *args, **kwargs): + args = [method, other_node] + list(args) + # return rpc.rpc_sync(other_node, _call_method, args=args, kwargs=kwargs) + return rpc.rpc_sync(other_node, method, args=args, kwargs=kwargs) \ No newline at end of file diff --git a/fltk/datasets/__init__.py b/fltk/datasets/__init__.py index 38534a4d..6caa5832 100644 --- a/fltk/datasets/__init__.py +++ b/fltk/datasets/__init__.py @@ -1,4 +1,4 @@ from .cifar10 import CIFAR10Dataset from .cifar100 import CIFAR100Dataset from .fashion_mnist import FashionMNISTDataset -from .mnist import MNIST \ No newline at end of file +from .mnist import MNIST diff --git a/fltk/datasets/distributed/__init__.py b/fltk/datasets/distributed/__init__.py new file mode 100644 index 00000000..c008cda5 --- /dev/null +++ 
b/fltk/datasets/distributed/__init__.py @@ -0,0 +1,5 @@ +from .cifar10 import DistCIFAR10Dataset +from .cifar100 import DistCIFAR100Dataset +from .fashion_mnist import DistFashionMNISTDataset +from .mnist import DistMNISTDataset +from .dataset import DistDataset \ No newline at end of file diff --git a/fltk/datasets/distributed/cifar10.py b/fltk/datasets/distributed/cifar10.py new file mode 100644 index 00000000..31b0769a --- /dev/null +++ b/fltk/datasets/distributed/cifar10.py @@ -0,0 +1,48 @@ +from torchvision import datasets +from torchvision import transforms +from torch.utils.data import DataLoader, DistributedSampler + +from fltk.datasets.distributed.dataset import DistDataset +import logging + +from fltk.samplers import get_sampler + + +class DistCIFAR10Dataset(DistDataset): + + def __init__(self, args): + super(DistCIFAR10Dataset, self).__init__(args) + self.init_train_dataset() + self.init_test_dataset() + + def init_train_dataset(self): + dist_loader_text = "distributed" if self.args.get_distributed() else "" + self.logger.debug(f"Loading '{dist_loader_text}' CIFAR10 train data") + # self.get_args().get_logger().debug(f"Loading '{dist_loader_text}' CIFAR10 train data") + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + transform = transforms.Compose([ + transforms.RandomHorizontalFlip(), + transforms.RandomCrop(32, 4), + transforms.ToTensor(), + normalize + ]) + self.train_dataset = datasets.CIFAR10(root=self.get_args().get_data_path(), train=True, download=True, + transform=transform) + self.train_sampler = get_sampler(self.train_dataset, self.args) + self.train_loader = DataLoader(self.train_dataset, batch_size=16, sampler=self.train_sampler) + self.logger.info("this client gets {} samples".format(len(self.train_sampler))) + # logging.info("this client gets {} samples".format(len(self.train_sampler))) + + def init_test_dataset(self): + self.logger.debug("Loading CIFAR10 test data") + # self.get_args().get_logger().debug("Loading CIFAR10 test data") + + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + transform = transforms.Compose([ + transforms.ToTensor(), + normalize + ]) + self.test_dataset = datasets.CIFAR10(root=self.get_args().get_data_path(), train=False, download=True, + transform=transform) + self.test_sampler = get_sampler(self.test_dataset, self.args) + self.test_loader = DataLoader(self.test_dataset, batch_size=16, sampler=self.test_sampler) diff --git a/fltk/datasets/distributed/cifar100.py b/fltk/datasets/distributed/cifar100.py new file mode 100644 index 00000000..640677eb --- /dev/null +++ b/fltk/datasets/distributed/cifar100.py @@ -0,0 +1,89 @@ +from torchvision import datasets +from torchvision import transforms +from torch.utils.data import DataLoader, DistributedSampler +from fltk.datasets.distributed.dataset import DistDataset +from fltk.samplers import get_sampler + + +class DistCIFAR100Dataset(DistDataset): + + def __init__(self, args): + super(DistCIFAR100Dataset, self).__init__(args) + self.init_train_dataset() + self.init_test_dataset() + + def init_train_dataset(self): + dist_loader_text = "distributed" if self.args.get_distributed() else "" + self.logger.debug(f"Loading '{dist_loader_text}' CIFAR100 train data") + normalize = transforms.Normalize(mean=[0.507, 0.487, 0.441], std=[0.267, 0.256, 0.276]) + transform = transforms.Compose([ + transforms.RandomHorizontalFlip(), + transforms.RandomCrop(32, 4), + transforms.ToTensor(), + normalize + ]) + self.train_dataset = 
datasets.CIFAR100(root=self.get_args().get_data_path(), train=True, download=True, + transform=transform) + self.train_sampler = get_sampler(self.train_dataset, self.args) + self.train_loader = DataLoader(self.train_dataset, batch_size=16, sampler=self.train_sampler) + + def init_test_dataset(self): + dist_loader_text = "distributed" if self.args.get_distributed() else "" + self.logger.debug(f"Loading '{dist_loader_text}' CIFAR100 test data") + + normalize = transforms.Normalize(mean=[0.507, 0.487, 0.441], std=[0.267, 0.256, 0.276]) + transform = transforms.Compose([ + transforms.ToTensor(), + normalize + ]) + self.test_dataset = datasets.CIFAR100(root=self.get_args().get_data_path(), train=False, download=True, + transform=transform) + self.test_sampler = get_sampler(self.test_dataset, self.args) + self.test_loader = DataLoader(self.test_dataset, batch_size=16, sampler=self.test_sampler) + + + def load_train_dataset(self): + dist_loader_text = "distributed" if self.args.get_distributed() else "" + self.logger.debug(f"Loading '{dist_loader_text}' CIFAR100 train data") + + normalize = transforms.Normalize(mean=[0.507, 0.487, 0.441], std=[0.267, 0.256, 0.276]) + transform = transforms.Compose([ + transforms.RandomHorizontalFlip(), + transforms.RandomCrop(32, 4), + transforms.ToTensor(), + normalize + ]) + + train_dataset = datasets.CIFAR100(root=self.get_args().get_data_path(), train=True, download=True, + transform=transform) + sampler = get_sampler(self.test_dataset, self.args) + + train_loader = DataLoader(train_dataset, batch_size=len(train_dataset), sampler=sampler) + self.args.set_sampler(sampler) + + train_data = self.get_tuple_from_data_loader(train_loader) + dist_loader_text = "distributed" if self.args.get_distributed() else "" + self.logger.debug(f"Finished loading '{dist_loader_text}' CIFAR100 train data") + + return train_data + + def load_test_dataset(self): + self.logger.debug("Loading CIFAR100 test data") + + normalize = transforms.Normalize(mean=[0.507, 0.487, 0.441], std=[0.267, 0.256, 0.276]) + transform = transforms.Compose([ + transforms.ToTensor(), + normalize + ]) + test_dataset = datasets.CIFAR100(root=self.get_args().get_data_path(), train=False, download=True, + transform=transform) + sampler = get_sampler(self.test_dataset, self.args) + test_loader = DataLoader(test_dataset, batch_size=len(test_dataset), sampler=sampler) + self.args.set_sampler(sampler) + + test_data = self.get_tuple_from_data_loader(test_loader) + + self.logger.debug("Finished loading CIFAR10 test data") + + return test_data + diff --git a/fltk/datasets/distributed/dataset.py b/fltk/datasets/distributed/dataset.py new file mode 100644 index 00000000..83f4cf31 --- /dev/null +++ b/fltk/datasets/distributed/dataset.py @@ -0,0 +1,141 @@ +from abc import abstractmethod +from torch.utils.data import DataLoader +from torch.utils.data import TensorDataset +import torch +import numpy + +from fltk.util.arguments import Arguments +from fltk.util.log import getLogger + + +class DistDataset: + + train_sampler = None + test_sampler = None + train_dataset = None + test_dataset = None + train_loader = None + test_loader = None + logger = getLogger(__name__) + def __init__(self, args: Arguments): + self.args = args + # self.train_dataset = self.load_train_dataset() + # self.test_dataset = self.load_test_dataset() + + def get_args(self): + """ + Returns the arguments. + + :return: Arguments + """ + return self.args + + # def get_train_dataset(self): + # """ + # Returns the train dataset. 
+ # + # :return: tuple + # """ + # return self.train_dataset + # + # def get_test_dataset(self): + # """ + # Returns the test dataset. + # + # :return: tuple + # """ + # return self.test_dataset + + def get_train_loader(self): + return self.train_loader + + def get_test_loader(self): + return self.test_loader + + def get_train_sampler(self): + return self.train_sampler + + def get_test_sampler(self): + return self.test_sampler + + @abstractmethod + def init_train_dataset(self): + raise NotImplementedError("load_train_dataset() isn't implemented") + + @abstractmethod + def init_test_dataset(self): + raise NotImplementedError("load_train_dataset() isn't implemented") + + # @abstractmethod + # def load_train_dataset(self): + # """ + # Loads & returns the training dataset. + # + # :return: tuple + # """ + # raise NotImplementedError("load_train_dataset() isn't implemented") + # + # @abstractmethod + # def load_test_dataset(self): + # """ + # Loads & returns the test dataset. + # + # :return: tuple + # """ + # raise NotImplementedError("load_test_dataset() isn't implemented") + + # def get_train_loader(self, batch_size, **kwargs): + # """ + # Return the data loader for the train dataset. + # + # :param batch_size: batch size of data loader + # :type batch_size: int + # :return: torch.utils.data.DataLoader + # """ + # return Dataset.get_data_loader_from_data(batch_size, self.train_dataset[0], self.train_dataset[1], **kwargs) + # + # def get_test_loader(self, batch_size, **kwargs): + # """ + # Return the data loader for the test dataset. + # + # :param batch_size: batch size of data loader + # :type batch_size: int + # :return: torch.utils.data.DataLoader + # """ + # return Dataset.get_data_loader_from_data(batch_size, self.test_dataset[0], self.test_dataset[1], **kwargs) + # + # @staticmethod + # def get_data_loader_from_data(batch_size, X, Y, **kwargs): + # """ + # Get a data loader created from a given set of data. + # + # :param batch_size: batch size of data loader + # :type batch_size: int + # :param X: data features + # :type X: numpy.Array() + # :param Y: data labels + # :type Y: numpy.Array() + # :return: torch.utils.data.DataLoader + # """ + # X_torch = torch.from_numpy(X).float() + # + # if "classification_problem" in kwargs and kwargs["classification_problem"] == False: + # Y_torch = torch.from_numpy(Y).float() + # else: + # Y_torch = torch.from_numpy(Y).long() + # dataset = TensorDataset(X_torch, Y_torch) + # + # kwargs.pop("classification_problem", None) + # + # return DataLoader(dataset, batch_size=batch_size, **kwargs) + # + # @staticmethod + # def get_tuple_from_data_loader(data_loader): + # """ + # Get a tuple representation of the data stored in a data loader. 
+ # + # :param data_loader: data loader to get data from + # :type data_loader: torch.utils.data.DataLoader + # :return: tuple + # """ + # return (next(iter(data_loader))[0].numpy(), next(iter(data_loader))[1].numpy()) diff --git a/fltk/datasets/distributed/fashion_mnist.py b/fltk/datasets/distributed/fashion_mnist.py new file mode 100644 index 00000000..770f6b66 --- /dev/null +++ b/fltk/datasets/distributed/fashion_mnist.py @@ -0,0 +1,55 @@ +from fltk.datasets.distributed import DistDataset +from torchvision import datasets +from torchvision import transforms +from torch.utils.data import DataLoader, DistributedSampler + +from fltk.samplers import get_sampler + + +class DistFashionMNISTDataset(DistDataset): + + def __init__(self, args): + super(DistFashionMNISTDataset, self).__init__(args) + self.init_train_dataset() + self.init_test_dataset() + + def init_train_dataset(self): + dist_loader_text = "distributed" if self.args.get_distributed() else "" + self.logger.debug(f"Loading '{dist_loader_text}' Fashion MNIST train data") + + self.train_dataset = datasets.FashionMNIST(root=self.get_args().get_data_path(), train=True, download=True, + transform=transforms.Compose([transforms.ToTensor()])) + self.train_sampler = get_sampler(self.train_dataset, self.args) + self.train_loader = DataLoader(self.train_dataset, batch_size=16, sampler=self.train_sampler) + + def init_test_dataset(self): + dist_loader_text = "distributed" if self.args.get_distributed() else "" + self.logger.debug(f"Loading '{dist_loader_text}' Fashion MNIST test data") + self.test_dataset = datasets.FashionMNIST(root=self.get_args().get_data_path(), train=False, download=True, + transform=transforms.Compose([transforms.ToTensor()])) + self.test_sampler = get_sampler(self.test_dataset, self.args) + self.test_loader = DataLoader(self.test_dataset, batch_size=16, sampler=self.test_sampler) + + def load_train_dataset(self): + self.logger.debug("Loading Fashion MNIST train data") + + train_dataset = datasets.FashionMNIST(self.get_args().get_data_path(), train=True, download=True, transform=transforms.Compose([transforms.ToTensor()])) + train_loader = DataLoader(train_dataset, batch_size=len(train_dataset)) + + train_data = self.get_tuple_from_data_loader(train_loader) + + self.logger.debug("Finished loading Fashion MNIST train data") + + return train_data + + def load_test_dataset(self): + self.logger.debug("Loading Fashion MNIST test data") + + test_dataset = datasets.FashionMNIST(self.get_args().get_data_path(), train=False, download=True, transform=transforms.Compose([transforms.ToTensor()])) + test_loader = DataLoader(test_dataset, batch_size=len(test_dataset)) + + test_data = self.get_tuple_from_data_loader(test_loader) + + self.logger.debug("Finished loading Fashion MNIST test data") + + return test_data diff --git a/fltk/datasets/distributed/mnist.py b/fltk/datasets/distributed/mnist.py new file mode 100644 index 00000000..a4056a3c --- /dev/null +++ b/fltk/datasets/distributed/mnist.py @@ -0,0 +1,123 @@ +from __future__ import annotations +from fltk.datasets import DistDataset +from torchvision import datasets, transforms +from torch.utils.data import DataLoader +# from fltk.strategy import get_sampler, get_augmentations, get_augmentations_tensor, UnifyingSampler +from random import choice +from PIL import Image + + +# typing: +from typing import TYPE_CHECKING, Tuple, Any, List + +from fltk.samplers import get_sampler + +if TYPE_CHECKING: + from fltk.util import BareConfig + +# class MNIST(datasets.MNIST): +# def 
__init__(self, root:str, transform, augment:bool=False): +# super().__init__(root=root, train=True, download=True, transform=transform) +# if augment: +# self.augmentation_transforms = get_augmentations() +# self.tensor_augmentations = get_augmentations_tensor() +# +# def __getitem__(self, index: int) -> Tuple[Any, Any]: +# augment = False +# if isinstance(index, str): +# target = int(index) +# index = choice(self.ordedered_by_label[target]) +# augment = True +# +# img, target = self.data[index], int(self.targets[index]) +# +# img = img.numpy() +# if augment: +# img = self.augmentation_transforms(image=img)['image'] +# img = Image.fromarray(img, mode='L') +# img = self.tensor_augmentations(img) +# +# if self.transform is not None: +# img = self.transform(img) +# +# return img, target +# +# def set_available_indices(self, ordedered_by_label:List[int]): +# self.ordedered_by_label = ordedered_by_label +# +# class DistMNISTDataset_2(DistDataset): +# +# def __init__(self, args:BareConfig): +# super(DistMNISTDataset_2, self).__init__(args) +# self.augment = args.augment +# self.augmented_emd = args.augmented_emd +# self.init_train_dataset(args) +# self.init_test_dataset() +# +# def init_train_dataset(self, args:BareConfig): +# dist_loader_text = "distributed" if self.args.get_distributed() else "" +# self.get_args().get_logger().debug(f"Loading '{dist_loader_text}' MNIST train data") +# +# self.train_dataset = MNIST(root=self.get_args().get_data_path(), transform=transforms.ToTensor(), augment=self.augment) +# self.train_sampler = get_sampler(self.train_dataset, self.args) +# self.train_dataset.set_available_indices(self.train_sampler.order_by_label(self.train_dataset)) +# if self.augment: +# self.train_sampler = UnifyingSampler(self.train_dataset, args, self.train_sampler, self.augmented_emd) +# self.train_loader = DataLoader(self.train_dataset, batch_size=16, sampler=self.train_sampler) +# +# def init_test_dataset(self): +# dist_loader_text = "distributed" if self.args.get_distributed() else "" +# self.get_args().get_logger().debug(f"Loading '{dist_loader_text}' MNIST test data") +# self.test_dataset = datasets.MNIST(root=self.get_args().get_data_path(), train=False, download=True, +# transform=transforms.Compose([transforms.ToTensor()])) +# self.test_sampler = get_sampler(self.test_dataset, self.args) +# self.test_loader = DataLoader(self.test_dataset, batch_size=16, sampler=self.test_sampler) + + +class DistMNISTDataset(DistDataset): + + def __init__(self, args): + super(DistMNISTDataset, self).__init__(args) + self.init_train_dataset() + self.init_test_dataset() + + def init_train_dataset(self): + dist_loader_text = "distributed" if self.args.get_distributed() else "" + self.logger.debug(f"Loading '{dist_loader_text}' MNIST train data") + + self.train_dataset = datasets.MNIST(root=self.get_args().get_data_path(), train=True, download=True, + transform=transforms.Compose([transforms.ToTensor()])) + self.train_sampler = get_sampler(self.train_dataset, self.args) + self.train_loader = DataLoader(self.train_dataset, batch_size=16, sampler=self.train_sampler) + + def init_test_dataset(self): + dist_loader_text = "distributed" if self.args.get_distributed() else "" + self.logger.debug(f"Loading '{dist_loader_text}' MNIST test data") + self.test_dataset = datasets.MNIST(root=self.get_args().get_data_path(), train=False, download=True, + transform=transforms.Compose([transforms.ToTensor()])) + self.test_sampler = get_sampler(self.test_dataset, self.args) + self.test_loader = 
DataLoader(self.test_dataset, batch_size=16, sampler=self.test_sampler) + + def load_train_dataset(self): + self.logger.debug("Loading MNIST train data") + + train_dataset = datasets.MNIST(self.get_args().get_data_path(), train=True, download=True, transform=transforms.Compose([transforms.ToTensor()])) + train_loader = DataLoader(train_dataset, batch_size=len(train_dataset)) + + train_data = self.get_tuple_from_data_loader(train_loader) + + self.logger.debug("Finished loading MNIST train data") + + return train_data + + def load_test_dataset(self): + self.logger.debug("Loading MNIST test data") + + test_dataset = datasets.MNIST(self.get_args().get_data_path(), train=False, download=True, transform=transforms.Compose([transforms.ToTensor()])) + test_loader = DataLoader(test_dataset, batch_size=len(test_dataset)) + + test_data = self.get_tuple_from_data_loader(test_loader) + + self.logger.debug("Finished loading MNIST test data") + + return test_data \ No newline at end of file diff --git a/fltk/datasets/loader_util.py b/fltk/datasets/loader_util.py new file mode 100644 index 00000000..1671882f --- /dev/null +++ b/fltk/datasets/loader_util.py @@ -0,0 +1,36 @@ +from fltk.datasets.distributed.mnist import DistMNISTDataset +from fltk.datasets.distributed.cifar10 import DistCIFAR10Dataset +from fltk.datasets.distributed.cifar100 import DistCIFAR100Dataset +from fltk.datasets.distributed.fashion_mnist import DistFashionMNISTDataset +from fltk.util.definitions import Dataset + +def available_datasets(): + return { + Dataset.cifar10: DistCIFAR10Dataset, + Dataset.cifar100: DistCIFAR100Dataset, + Dataset.fashion_mnist: DistFashionMNISTDataset, + Dataset.mnist: DistMNISTDataset + } + +def get_dataset(name: Dataset): + return available_datasets()[name] + + +def get_train_loader_path(name: Dataset) -> str: + paths = { + Dataset.cifar10: 'data_loaders/cifar10/train_data_loader.pickle', + Dataset.fashion_mnist: 'data_loaders/fashion-mnist/train_data_loader.pickle', + Dataset.cifar100: 'data_loaders/cifar100/train_data_loader.pickle', + Dataset.mnist: 'data_loaders/mnist/train_data_loader.pickle', + } + return paths[name] + + +def get_test_loader_path(name: Dataset)-> str: + paths = { + Dataset.cifar10: 'data_loaders/cifar10/test_data_loader.pickle', + Dataset.fashion_mnist: 'data_loaders/fashion-mnist/test_data_loader.pickle', + Dataset.cifar100: 'data_loaders/cifar100/test_data_loader.pickle', + Dataset.mnist: 'data_loaders/mnist/test_data_loader.pickle', + } + return paths[name] \ No newline at end of file diff --git a/fltk/launch.py b/fltk/launch.py index bef1791d..bade14e4 100644 --- a/fltk/launch.py +++ b/fltk/launch.py @@ -6,12 +6,12 @@ import torch.distributed as dist from kubernetes import config -from fltk.client import Client -from fltk.extractor import download_datasets -from fltk.orchestrator import Orchestrator +from fltk.core.distributed.client import Client +from fltk.core.distributed.extractor import download_datasets +from fltk.core.distributed.orchestrator import Orchestrator from fltk.util.cluster.client import ClusterManager from fltk.util.config.arguments import LearningParameters -from fltk.util.config.base_config import BareConfig +from fltk.util.config import DistributedConfig from fltk.util.task.generator.arrival_generator import ExperimentGenerator @@ -28,7 +28,7 @@ def should_distribute() -> bool: return dist.is_available() and world_size > 1 -def launch_client(task_id: str, config: BareConfig = None, learning_params: LearningParameters = None, +def launch_client(task_id: str, 
config: DistributedConfig = None, learning_params: LearningParameters = None, namespace: Namespace = None): """ @param task_id: String representation (should be unique) corresponding to a client. @@ -57,7 +57,7 @@ def launch_client(task_id: str, config: BareConfig = None, learning_params: Lear print(epoch_data) -def launch_orchestrator(args: Namespace = None, conf: BareConfig = None): +def launch_orchestrator(args: Namespace = None, conf: DistributedConfig = None): """ Default runner for the Orchestrator that is based on KubeFlow @param args: Commandline arguments passed to the execution. Might be removed in a future commit. @@ -98,7 +98,7 @@ def launch_orchestrator(args: Namespace = None, conf: BareConfig = None): logging.info("Stopped execution of Orchestrator...") -def launch_extractor(args: Namespace, conf: BareConfig): +def launch_extractor(args: Namespace, conf: DistributedConfig): """ Extractor launch function, will only download all models and quit execution. @param args: Arguments passed from CLI. diff --git a/fltk/nets/__init__.py b/fltk/nets/__init__.py index 8f9abe90..0c6da437 100644 --- a/fltk/nets/__init__.py +++ b/fltk/nets/__init__.py @@ -5,4 +5,35 @@ from .cifar_10_resnet import Cifar10ResNet, ResNet18, ResNet34, ResNet50, ResNet101, ResNet152 from .cifar_100_vgg import Cifar100VGG, vgg11_bn, vgg13_bn, vgg16_bn, vgg19_bn from .reddit_lstm import RNNModel -from.simple import SimpleMnist, SimpleNet +from .mnist_cnn import MNIST_CNN +from .simple import SimpleMnist, SimpleNet +from ..util.definitions import Nets + + +def available_nets(): + return { + Nets.cifar100_resnet: Cifar100ResNet, + Nets.cifar100_vgg: Cifar100VGG, + Nets.cifar10_cnn: Cifar10CNN, + Nets.cifar10_resnet: Cifar10ResNet, + Nets.fashion_mnist_cnn: FashionMNISTCNN, + Nets.fashion_mnist_resnet: FashionMNISTResNet, + Nets.mnist_cnn: MNIST_CNN, + + } + +def get_net(name: Nets): + return available_nets()[name] + + +def get_net_split_point(name: Nets): + nets_split_point = { + Nets.cifar100_resnet: 48, + Nets.cifar100_vgg: 28, + Nets.cifar10_cnn: 15, + Nets.cifar10_resnet: 39, + Nets.fashion_mnist_cnn: 7, + Nets.fashion_mnist_resnet: 7, + Nets.mnist_cnn: 2, + } + return nets_split_point[name] \ No newline at end of file diff --git a/fltk/nets/mnist_cnn.py b/fltk/nets/mnist_cnn.py new file mode 100644 index 00000000..5f4b69cd --- /dev/null +++ b/fltk/nets/mnist_cnn.py @@ -0,0 +1,20 @@ +import torch.nn as nn +import torch.nn.functional as F + +class MNIST_CNN(nn.Module): + def __init__(self): + super().__init__() + self.conv1 = nn.Conv2d(1, 10, kernel_size=5) + self.conv2 = nn.Conv2d(10, 20, kernel_size=5) + self.conv2_drop = nn.Dropout2d() + self.fc1 = nn.Linear(320, 50) + self.fc2 = nn.Linear(50, 10) + + def forward(self, x): + x = F.relu(F.max_pool2d(self.conv1(x), 2)) + x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2)) + x = x.view(-1, 320) + x = F.relu(self.fc1(x)) + x = F.dropout(x, training=self.training) + x = self.fc2(x) + return F.log_softmax(x) \ No newline at end of file diff --git a/fltk/nets/util/aggregration.py b/fltk/nets/util/aggregration.py index 7495b620..e69de29b 100644 --- a/fltk/nets/util/aggregration.py +++ b/fltk/nets/util/aggregration.py @@ -1,11 +0,0 @@ -def average_nn_parameters(parameters): - """ - Takes unweighted average of a list of Tensor weights. Averages passed parameters. 
- - :param parameters: nn model named parameters - :type parameters: list - """ - new_params = {} - for name in parameters[0].keys(): - new_params[name] = sum([param[name].data for param in parameters]) / len(parameters) - return new_params diff --git a/fltk/samplers/__init__.py b/fltk/samplers/__init__.py new file mode 100644 index 00000000..d808d4ff --- /dev/null +++ b/fltk/samplers/__init__.py @@ -0,0 +1,41 @@ +from .distributed_sampler import DistributedSamplerWrapper +from .uniform import UniformSampler +from .n_label import N_Labels +from .q_sampler import Probability_q_Sampler +from .dirichlet import DirichletSampler +from .limit_labels import LimitLabelsSampler +from .limit_labels_flex import LimitLabelsSamplerFlex +from ..util.definitions import DataSampler +from ..util.log import getLogger + + +def get_sampler(dataset, args): + logger = getLogger(__name__) + sampler = None + if args.get_distributed(): + method = args.get_sampler() + logger.debug( + "Using {} sampler method, with args: {}".format(method, args.get_sampler_args())) + + if method == DataSampler.uniform: + sampler = UniformSampler(dataset, num_replicas=args.get_world_size(), rank=args.get_rank()) + elif method == DataSampler.q_sampler: + sampler = Probability_q_Sampler(dataset, num_replicas=args.get_world_size(), rank=args.get_rank(), + args=args.get_sampler_args()) + elif method == DataSampler.limit_labels: + sampler = LimitLabelsSampler(dataset, num_replicas=args.get_world_size(), rank=args.get_rank(), + args=args.get_sampler_args()) + elif method == DataSampler.limit_labels_flex: + sampler = LimitLabelsSamplerFlex(dataset, num_replicas=args.get_world_size(), rank=args.get_rank(), + args=args.get_sampler_args()) + elif method == DataSampler.n_labels: + sampler = N_Labels(dataset, num_replicas=args.get_world_size(), rank=args.get_rank(), + args=args.get_sampler_args()) + elif method == DataSampler.dirichlet: + sampler = DirichletSampler(dataset, num_replicas=args.get_world_size(), rank=args.get_rank(), + args=args.get_sampler_args()) + else: # default + logger.warning("Unknown sampler " + method + ", using uniform instead") + sampler = UniformSampler(dataset, num_replicas=args.get_world_size(), rank=args.get_rank()) + + return sampler diff --git a/fltk/samplers/dirichlet.py b/fltk/samplers/dirichlet.py new file mode 100644 index 00000000..4aac9545 --- /dev/null +++ b/fltk/samplers/dirichlet.py @@ -0,0 +1,44 @@ +from fltk.samplers import DistributedSamplerWrapper +from torch.utils.data import DistributedSampler, Dataset +import numpy as np +import logging +import random +from collections import Counter + + +class DirichletSampler(DistributedSamplerWrapper): + """ Generates a (non-iid) data distribution by sampling the dirichlet distribution. Dirichlet constructs a + vector of length num_clients, that sums to one. Decreasing alpha results in a more non-iid data set. + This distribution method results in both label and quantity skew. 
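+
+    Rough sketch of the allocation step performed in __init__ below (alpha, the
+    client count and the label size are illustrative assumptions, not values
+    taken from this code):
+
+    >>> import numpy as np
+    >>> proportions = np.random.dirichlet([0.5] * 4)    # one weight per client, sums to 1
+    >>> allocation = (proportions * 6000).astype(int)   # e.g. array([4213,  312, 1175,  298]); values vary per draw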
+ """ + def __init__(self, dataset: Dataset, num_replicas = None, + rank = None, args = (0.5, 42)) -> None: + alpha, seed = args + super().__init__(dataset, num_replicas=num_replicas, rank=rank, seed=seed) + + np.random.seed(seed) + indices = [] + ordered_by_label = self.order_by_label(dataset) + for labels in ordered_by_label: + n_samples = len(labels) + # generate an allocation by sampling dirichlet, which results in how many samples each client gets + allocation = np.random.dirichlet([alpha] * self.n_clients) * n_samples + allocation = allocation.astype(int) + start_index = allocation[0:self.client_id].sum() + end_index = 0 + if self.client_id + 1 == self.n_clients: # last client + end_index = n_samples + else: + end_index = start_index + allocation[self.client_id] + + selection = labels[start_index:end_index] + indices.extend(selection) + + labels = [dataset.targets[i] for i in indices] + logging.info("nr of samplers in client with rank {}: {}".format(rank, len(indices))) + logging.info("distribution in client with rank {}: {}".format(rank, Counter(labels))) + + random.seed(seed + self.client_id) # give each client a unique shuffle + random.shuffle(indices) # shuffle indices to spread the labels + + self.indices = indices \ No newline at end of file diff --git a/fltk/samplers/distributed_sampler.py b/fltk/samplers/distributed_sampler.py new file mode 100644 index 00000000..21d6a652 --- /dev/null +++ b/fltk/samplers/distributed_sampler.py @@ -0,0 +1,59 @@ +import random +import logging +from torch.utils.data import DistributedSampler, Dataset +from typing import Iterator +import numpy as np + + +class DistributedSamplerWrapper(DistributedSampler): + indices = [] + epoch_size = 1.0 + def __init__(self, dataset: Dataset, num_replicas = None, + rank = None, seed = 0) -> None: + super().__init__(dataset, num_replicas=num_replicas, rank=rank) + + self.client_id = rank - 1 + self.n_clients = num_replicas - 1 + self.n_labels = len(dataset.classes) + self.seed = seed + + + def order_by_label(self, dataset): + # order the indices by label + ordered_by_label = [[] for i in range(len(dataset.classes))] + for index, target in enumerate(dataset.targets): + ordered_by_label[target].append(index) + + return ordered_by_label + + def set_epoch_size(self, epoch_size: float) -> None: + """ Sets the epoch size as relative to the local amount of data. + 1.5 will result in the __iter__ function returning the available + indices with half appearing twice. 
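+        For example (illustrative numbers), with 1000 local indices and
+        epoch_size=1.5, __iter__ yields roughly 1500 indices: the full local
+        set once, plus a random half of it.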
+ + Args: + epoch_size (float): relative size of epoch + """ + self.epoch_size = epoch_size + + def __iter__(self) -> Iterator[int]: + random.seed(self.rank+self.epoch) + epochs_todo = self.epoch_size + indices = [] + while(epochs_todo > 0.0): + random.shuffle(self.indices) + if epochs_todo >= 1.0: + indices.extend(self.indices) + else: + end_index = int(round(len(self.indices)*epochs_todo)) + indices.extend(self.indices[:end_index]) + + epochs_todo = epochs_todo - 1 + + ratio = len(indices)/float(len(self.indices)) + np.testing.assert_almost_equal(ratio, self.epoch_size, decimal=2) + + return iter(indices) + + def __len__(self) -> int: + return len(self.indices) \ No newline at end of file diff --git a/fltk/samplers/limit_labels.py b/fltk/samplers/limit_labels.py new file mode 100644 index 00000000..a05039fd --- /dev/null +++ b/fltk/samplers/limit_labels.py @@ -0,0 +1,91 @@ +from fltk.samplers import DistributedSamplerWrapper +from torch.utils.data import DistributedSampler, Dataset +import numpy as np +import logging +import random +from collections import Counter + +class LimitLabelsSampler(DistributedSamplerWrapper): + """ + A sampler that limits the number of labels per client + """ + + def __init__(self, dataset, num_replicas, rank, args=(5, 42)): + limit, seed = args + super().__init__(dataset, num_replicas, rank, seed) + + if self.n_clients % self.n_labels != 0: + logging.error( + "multiples of {} clients are needed for the 'limiting-labels' data distribution method, {} does not work".format( + self.n_labels, self.n_clients)) + return + + n_occurrences = limit * int(self.n_clients / self.n_labels) # number of occurrences of each label + counters = [n_occurrences] * self.n_clients # keeps track of which labels still can be given out + labels = list(range(self.n_labels)) # list of labels to distribute + clients = list(range(self.n_clients)) # keeps track of which clients should still be given a label + client_labels = [set() for n in range(self.n_clients)] # set of labels given to each client + random.seed(seed) # seed, such that the same result can be obtained multiple times + + while labels: + # pick a random label + label = random.choice(labels) + counters[label] -= 1 # decrement counter of this label + if counters[label] == 0: # if needed, remove label + labels.remove(label) + + # check which clients the label can be given to + selectable = [i for i in clients if not label in client_labels[i]] + client = None + + if not selectable: + # poor choice, let's fix this -> swap two labels + # conditions for swapping: + # sets of labels A, B, with B incomplete, remaining label l that is not possible to give to B, s.t.: + # (1) l not in A + # (2) exists label l' in A but not in B + # l, l' can be swapped + + client = random.choice(clients) # label can not be given to this client + for c, s in enumerate(client_labels): + if len(s) == limit: # this a completed set + if label not in s: # label can be given to this client (1) + subset = s.difference(client_labels[client]) # remove labels client already has (2...) 
+ if subset: # subset is not empty (2 continued): + l = min(subset) # get a swappable label (in a deterministic way), and swap labels + client_labels[c].remove(l) + client_labels[c].add(label) + client_labels[client].add(l) + break + else: # normal operation, pick a rondom selectable client + client = random.choice(selectable) + client_labels[client].add(label) + + # check if this client has been given the maximum number of labels + if len(client_labels[client]) == limit: + clients.remove(client) + + # now we have a set of labels for each client + # client with rank=rank now needs to be given data + # all clients get the same amount of data, the first portion is given to client with rank 1, the second to rank 2, etc + + labels = client_labels[self.client_id] + logging.info("Client {} gets labels {}".format(self.rank, client_labels[self.client_id])) + indices = [] + ordered_by_label = self.order_by_label(dataset) + for label in labels: + n_samples = int(len(ordered_by_label[label]) / n_occurrences) + clients = [c for c, s in enumerate(client_labels) if label in s] # find out which clients have this label + index = clients.index(self.client_id) # find the position of this client + start_index = index * n_samples # inclusive + if rank == self.n_clients: + end_index = len(ordered_by_label[label]) # exclusive + else: + end_index = start_index + n_samples # exclusive + + indices += ordered_by_label[label][start_index:end_index] + + random.seed(seed + self.client_id) # give each client a unique shuffle + random.shuffle(indices) # shuffle indices to spread the labels + + self.indices = indices \ No newline at end of file diff --git a/fltk/samplers/limit_labels_flex.py b/fltk/samplers/limit_labels_flex.py new file mode 100644 index 00000000..d6dc659c --- /dev/null +++ b/fltk/samplers/limit_labels_flex.py @@ -0,0 +1,61 @@ +from fltk.samplers import DistributedSamplerWrapper +from torch.utils.data import DistributedSampler, Dataset +import numpy as np +import logging +import random +from collections import Counter + + +class LimitLabelsSamplerFlex(DistributedSamplerWrapper): + """ + A sampler that limits the number of labels per client + The number of clients must <= than number of labels + """ + + def __init__(self, dataset, num_replicas, rank, args=(5, 42)): + limit, seed = args + super().__init__(dataset, num_replicas, rank, seed) + + labels_per_client = int(np.floor(self.n_labels / self.n_clients)) + remaining_labels = self.n_labels - labels_per_client + labels = list(range(self.n_labels)) # list of labels to distribute + clients = list(range(self.n_clients)) # keeps track of which clients should still be given a label + client_labels = [set() for n in range(self.n_clients)] # set of labels given to each client + random.seed(seed) # seed, such that the same result can be obtained multiple times + print(client_labels) + + label_order = random.sample(labels, len(labels)) + client_label_dict = {} + for client_id in clients: + client_label_dict[client_id] = [] + for _ in range(labels_per_client): + chosen_label = label_order.pop() + client_label_dict[client_id].append(chosen_label) + client_labels[client_id].add(chosen_label) + client_label_dict['rest'] = label_order + + indices = [] + ordered_by_label = self.order_by_label(dataset) + labels = client_label_dict[self.client_id] + for label in labels: + n_samples = int(len(ordered_by_label[label])) + clients = [c for c, s in enumerate(client_labels) if label in s] # find out which clients have this label + index = clients.index(self.client_id) # find 
the position of this client + start_index = index * n_samples # inclusive + if rank == self.n_clients: + end_index = len(ordered_by_label[label]) # exclusive + else: + end_index = start_index + n_samples # exclusive + + indices += ordered_by_label[label][start_index:end_index] + + # Last part is uniform sampler + rest_indices = [] + for l in client_label_dict['rest']: + rest_indices += ordered_by_label[l] + filtered_rest_indices = rest_indices[self.rank:self.total_size:self.num_replicas] + indices += filtered_rest_indices + random.seed(seed + self.client_id) # give each client a unique shuffle + random.shuffle(indices) # shuffle indices to spread the labels + + self.indices = indices \ No newline at end of file diff --git a/fltk/samplers/n_label.py b/fltk/samplers/n_label.py new file mode 100644 index 00000000..4b00c1a9 --- /dev/null +++ b/fltk/samplers/n_label.py @@ -0,0 +1,174 @@ +from fltk.samplers import DistributedSamplerWrapper +from torch.utils.data import DistributedSampler, Dataset +import numpy as np +import logging +import random +from collections import Counter + + +class N_Labels(DistributedSamplerWrapper): + """ + A sampler that limits the number of labels per client + The number of clients must <= than number of labels + """ + + def __init__(self, dataset, num_replicas, rank, args=(5, 42)): + limit, seed = args + super().__init__(dataset, num_replicas, rank, seed) + + num_copies = np.ceil((args[0] * self.n_clients) / self.n_labels) + label_dict = {} + for l in range(self.n_labels): + label_dict[l] = num_copies + + def get_least_used_labels(l_dict: dict): + label_list = [[k, v] for k, v in label_dict.items()] + label_list[-1][1] = 0 + sorted_list = sorted(label_list, key=lambda x: x[1], reverse=True) + # print('d') + # label_list.sort(lambda x:x) + + def choice_n(l_dict: dict, n, seed_offset = 0): + # get_least_used_labels(l_dict) + labels = [k for k, v in label_dict.items() if v] + # summed = sum([int(v) for k, v in label_dict.items() if v]) + # amounts = [float(v) / float(summed) for k, v in label_dict.items() if v] + # # p = amounts / summed + print(f'Available labels: {labels} choose {n}') + # # np.random.seed(seed + seed_offset) + # # @TODO: Error is in this section! 
+ # print(f'n={n}, labels={labels}, p={amounts}') + # print(amounts) + + selected = np.random.choice(labels, n, replace=False) + # print(selected) + for k, v in l_dict.items(): + if k in selected: + # v -= 1 + l_dict[k] -= 1 + return selected + + + # print(f'N Clients={self.n_clients}') + # print(f'Num_buckets={num_copies}') + + clients = list(range(self.n_clients)) # keeps track of which clients should still be given a label + client_label_dict = {} + ordered_list = list(range(self.n_labels)) * int(num_copies) + + # Old code + # for idx, client_id in enumerate(clients): + # # client_label_dict[client_id] = [] + # label_set = choice_n(label_dict, args[0], idx) + # client_label_dict[client_id] = label_set + + # Now code + for idx, client_id in enumerate(clients): + label_set = [] + for _ in range(args[0]): + label_set.append(ordered_list.pop()) + client_label_dict[client_id] = label_set + + client_label_dict['rest'] = [] + # New code + if len(ordered_list): + client_label_dict['rest'] = ordered_list + + # Old code + # client_label_dict['rest'] = labels = [k for k, v in label_dict.items() if v] + # for k, v in label_dict.items(): + # for x in range(int(v)): + # client_label_dict['rest'].append(int(k)) + + # Order data by label; split into N buckets and select indices based on the order found in the client-label-dict + + reverse_label_dict = {} + for l in range(self.n_labels): + reverse_label_dict[l] = [] + + for k, v in client_label_dict.items(): + # print(f'client {k} has labels {v}') + for l_c in v: + reverse_label_dict[l_c].append(k) + + indices = [] + ordered_by_label = self.order_by_label(dataset) + indices_per_client = {} + for c in clients: + indices_per_client[c] = [] + + rest_indices = [] + for group, label_list in enumerate(ordered_by_label): + splitted = np.array_split(label_list, num_copies) + client_id_to_distribute = reverse_label_dict[group] + for split_part in splitted: + client_key = client_id_to_distribute.pop() + if client_key == 'rest': + rest_indices.append(split_part) + else: + indices_per_client[client_key].append(split_part) + # for split_part in splitted: + # @TODO: Fix this part in terms of code cleanness. 
Could be written more cleanly + if len(rest_indices): + rest_indices = np.concatenate(rest_indices) + rest_splitted = np.array_split(rest_indices, len(indices_per_client)) + + for k, v in indices_per_client.items(): + v.append(rest_splitted.pop()) + indices_per_client[k] = np.concatenate(v) + else: + rest_indices = np.ndarray([]) + for k, v in indices_per_client.items(): + indices_per_client[k] = np.concatenate(v) + + indices = indices_per_client[self.client_id] + random.seed(seed + self.client_id) # give each client a unique shuffle + random.shuffle(indices) # shuffle indices to spread the labels + + self.indices = indices + + # labels_per_client = int(np.floor(self.n_labels / self.n_clients)) + # remaining_labels = self.n_labels - labels_per_client + # labels = list(range(self.n_labels)) # list of labels to distribute + # clients = list(range(self.n_clients)) # keeps track of which clients should still be given a label + # client_labels = [set() for n in range(self.n_clients)] # set of labels given to each client + # random.seed(seed) # seed, such that the same result can be obtained multiple times + # print(client_labels) + # + # label_order = random.sample(labels, len(labels)) + # client_label_dict = {} + # for client_id in clients: + # client_label_dict[client_id] = [] + # for _ in range(labels_per_client): + # chosen_label = label_order.pop() + # client_label_dict[client_id].append(chosen_label) + # client_labels[client_id].add(chosen_label) + # client_label_dict['rest'] = label_order + # + # + # + # indices = [] + # ordered_by_label = self.order_by_label(dataset) + # labels = client_label_dict[self.client_id] + # for label in labels: + # n_samples = int(len(ordered_by_label[label])) + # clients = [c for c, s in enumerate(client_labels) if label in s] # find out which clients have this label + # index = clients.index(self.client_id) # find the position of this client + # start_index = index * n_samples # inclusive + # if rank == self.n_clients: + # end_index = len(ordered_by_label[label]) # exclusive + # else: + # end_index = start_index + n_samples # exclusive + # + # indices += ordered_by_label[label][start_index:end_index] + # + # # Last part is uniform sampler + # rest_indices = [] + # for l in client_label_dict['rest']: + # rest_indices += ordered_by_label[l] + # filtered_rest_indices = rest_indices[self.rank:self.total_size:self.num_replicas] + # indices += filtered_rest_indices + # random.seed(seed + self.client_id) # give each client a unique shuffle + # random.shuffle(indices) # shuffle indices to spread the labels + # + # self.indices = indices \ No newline at end of file diff --git a/fltk/samplers/q_sampler.py b/fltk/samplers/q_sampler.py new file mode 100644 index 00000000..77d38f0a --- /dev/null +++ b/fltk/samplers/q_sampler.py @@ -0,0 +1,57 @@ +from fltk.samplers import DistributedSamplerWrapper +from torch.utils.data import DistributedSampler, Dataset +import numpy as np +import logging +import random +from collections import Counter + + +class Probability_q_Sampler(DistributedSamplerWrapper): + """ + Clients are divided among M groups, with M being the number of labels. 
+ A sample with label m is than given to a member of group m with probability q, + and to any other group with probability (1-q)/(m-1) + + side effect of this method is that the reported loss on the test dataset becomes somewhat meaningless...logging.info("distribution in client with rank {}: {}".format(rank, Counter(labels))) + """ + + def __init__(self, dataset, num_replicas, rank, args=(0.5, 42)): + q, seed = args + super().__init__(dataset, num_replicas, rank, seed) + + if self.n_clients % self.n_labels != 0: + logging.error( + "multiples of {} clients are needed for the 'probability-q-sampler' data distribution method, {} does not work".format( + self.n_labels, self.n_clients)) + return + + # divide data among groups + counter = 0 # for dividing data within a group + group_id = self.client_id % self.n_labels + group_clients = [client for client in range(self.n_clients) if client % self.n_labels == group_id] + indices = [] + random.seed(seed) + ordered_by_label = self.order_by_label(dataset) + for group, label_list in enumerate(ordered_by_label): + for sample_idx in label_list: + rnd_val = random.random() + if rnd_val < q: + if group == group_id: + if group_clients[counter] == self.client_id: + indices.append(sample_idx) + counter = (counter + 1) % len(group_clients) + else: + others = [grp for grp in range(self.n_labels) if grp != group] + if random.choice(others) == group_id: + if group_clients[counter] == self.client_id: + indices.append(sample_idx) + counter = (counter + 1) % len(group_clients) + + labels = [dataset.targets[i] for i in indices] + logging.info("nr of samplers in client with rank {}: {}".format(rank, len(indices))) + logging.info("distribution in client with rank {}: {}".format(rank, Counter(labels))) + + random.seed(seed + self.client_id) # give each client a unique shuffle + random.shuffle(indices) # shuffle indices to spread the labels + + self.indices = indices \ No newline at end of file diff --git a/fltk/samplers/uniform.py b/fltk/samplers/uniform.py new file mode 100644 index 00000000..69e826f1 --- /dev/null +++ b/fltk/samplers/uniform.py @@ -0,0 +1,13 @@ +from fltk.samplers import DistributedSamplerWrapper +from torch.utils.data import DistributedSampler, Dataset +import numpy as np +import logging +import random +from collections import Counter + + +class UniformSampler(DistributedSamplerWrapper): + def __init__(self, dataset, num_replicas=None, rank=None, seed=0): + super().__init__(dataset, num_replicas=num_replicas, rank=rank, seed=seed) + indices = list(range(len(self.dataset))) + self.indices = indices[self.rank:self.total_size:self.n_clients] \ No newline at end of file diff --git a/fltk/strategy/__init__.py b/fltk/strategy/__init__.py index e69de29b..15884b28 100644 --- a/fltk/strategy/__init__.py +++ b/fltk/strategy/__init__.py @@ -0,0 +1,4 @@ +from .aggregation import * +from .client_selection import * +from .optimization import * +from .offloading import OffloadingStrategy, parse_strategy diff --git a/fltk/strategy/aggregation/FedAvg.py b/fltk/strategy/aggregation/FedAvg.py new file mode 100644 index 00000000..041f4628 --- /dev/null +++ b/fltk/strategy/aggregation/FedAvg.py @@ -0,0 +1,18 @@ + + +def fed_avg(parameters, sizes): + new_params = {} + sum_size = 0 + for client in parameters: + for name in parameters[client].keys(): + try: + new_params[name].data += (parameters[client][name].data * sizes[client]) + except: + new_params[name] = (parameters[client][name].data * sizes[client]) + sum_size += sizes[client] + + for name in new_params: + # 
@TODO: Is .long() really required? + new_params[name].data = new_params[name].data.long() / sum_size + + return new_params \ No newline at end of file diff --git a/fltk/strategy/aggregation/__init__.py b/fltk/strategy/aggregation/__init__.py new file mode 100644 index 00000000..fca94c72 --- /dev/null +++ b/fltk/strategy/aggregation/__init__.py @@ -0,0 +1,13 @@ +from fltk.util.definitions import Aggregations +from .FedAvg import fed_avg +from .aggregation import average_nn_parameters, average_nn_parameters_simple + + +def get_aggregation(name: Aggregations): + enum_type = Aggregations(name.value) + aggregations_dict = { + Aggregations.fedavg: fed_avg, + Aggregations.sum: lambda x: x, + Aggregations.avg: lambda x: x*2 + } + return aggregations_dict[enum_type] \ No newline at end of file diff --git a/fltk/strategy/aggregation.py b/fltk/strategy/aggregation/aggregation.py similarity index 52% rename from fltk/strategy/aggregation.py rename to fltk/strategy/aggregation/aggregation.py index f18ac1aa..14445939 100644 --- a/fltk/strategy/aggregation.py +++ b/fltk/strategy/aggregation/aggregation.py @@ -1,8 +1,23 @@ + + +def average_nn_parameters_simple(parameters): + """ + Averages passed parameters. + :param parameters: nn model named parameters + :type parameters: list + """ + new_params = {} + for name in parameters[0].keys(): + new_params[name] = sum([param[name].data for param in parameters]) / len(parameters) + + return new_params + + def average_nn_parameters(parameters): """ - @deprecated Average passed parameters. - @param parameters: nn model named parameters - @type parameters: list + Averages passed parameters. + :param parameters: nn model named parameters + :type parameters: list """ new_params = {} for name in parameters[0].keys(): @@ -11,13 +26,13 @@ def average_nn_parameters(parameters): return new_params -def fed_average_nn_parameters(parameters, sizes): +def average_nn_parameters(parameters, sizes): """ - @deprecated Federated Average passed parameters. - @param parameters: nn model named parameters - @type parameters: list - @param sizes: - @type sizes: + Federated Average passed parameters. 
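+    Note: after this rename the module defines two functions named
+    average_nn_parameters (the unweighted one above and this size-weighted
+    one); at import time the second definition shadows the first.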
+ :param parameters: nn model named parameters + :type parameters: list + :param sizes: + :type sizes: """ new_params = {} sum_size = 0 diff --git a/fltk/strategy/client_selection/__init__.py b/fltk/strategy/client_selection/__init__.py new file mode 100644 index 00000000..f490a0da --- /dev/null +++ b/fltk/strategy/client_selection/__init__.py @@ -0,0 +1,2 @@ +from .random_selection import random_selection +from .tifl import tifl_select_tier, tifl_select_tier_and_decrement, tifl_can_select_tier, tifl_update_probs \ No newline at end of file diff --git a/fltk/strategy/client_selection.py b/fltk/strategy/client_selection/random_selection.py similarity index 99% rename from fltk/strategy/client_selection.py rename to fltk/strategy/client_selection/random_selection.py index 34900ce8..716665a5 100644 --- a/fltk/strategy/client_selection.py +++ b/fltk/strategy/client_selection/random_selection.py @@ -1,4 +1,5 @@ import numpy as np + def random_selection(clients, n): return np.random.choice(clients, n, replace=False) \ No newline at end of file diff --git a/fltk/strategy/client_selection/tifl.py b/fltk/strategy/client_selection/tifl.py new file mode 100644 index 00000000..20be7311 --- /dev/null +++ b/fltk/strategy/client_selection/tifl.py @@ -0,0 +1,31 @@ +import numpy as np + + +def tifl_select_tier(tiers): + print([x[3] for x in tiers]) + return np.random.choice([x[0] for x in tiers], 1, p=[x[3] for x in tiers])[0] + + +def tifl_update_probs(tiers): + n = len([x for x in tiers if x[2] > 0]) + D = n * (n +1) / 2 + tiers.sort(key=lambda x:x[1]) + idx_decr = 0 + for idx, tier in enumerate(tiers): + if tier[2] > 0: + tier[3] = (n - (idx - idx_decr)) / D + else: + tier[3] = 0 + idx_decr += 1 + + +def tifl_select_tier_and_decrement(tiers): + selected_tier = tifl_select_tier(tiers) + for tier in tiers: + if tier[0] == selected_tier: + tier[2] -= 1 + return selected_tier + + +def tifl_can_select_tier(tiers): + return len([x for x in tiers if x[2] > 0]) \ No newline at end of file diff --git a/fltk/strategy/data_samplers.py b/fltk/strategy/data_samplers.py index e452e14f..e69de29b 100644 --- a/fltk/strategy/data_samplers.py +++ b/fltk/strategy/data_samplers.py @@ -1,269 +0,0 @@ -import logging -import random -from collections import Counter -from typing import Iterator - -import numpy as np -from torch.utils.data import DistributedSampler, Dataset - - -class DistributedSamplerWrapper(DistributedSampler): - indices = [] - epoch_size = 1.0 - - def __init__(self, dataset: Dataset, num_replicas=None, - rank=None, seed=0) -> None: - super().__init__(dataset, num_replicas=num_replicas, rank=rank) - - self.client_id = rank - 1 - self.n_clients = num_replicas - 1 - self.n_labels = len(dataset.classes) - self.seed = seed - - def order_by_label(self, dataset): - # order the indices by label - ordered_by_label = [[] for i in range(len(dataset.classes))] - for index, target in enumerate(dataset.targets): - ordered_by_label[target].append(index) - - return ordered_by_label - - def set_epoch_size(self, epoch_size: float) -> None: - """ Sets the epoch size as relative to the local amount of data. - 1.5 will result in the __iter__ function returning the available - indices with half appearing twice. 
- - Args: - epoch_size (float): relative size of epoch - """ - self.epoch_size = epoch_size - - def __iter__(self) -> Iterator[int]: - random.seed(self.rank + self.epoch) - epochs_todo = self.epoch_size - indices = [] - while (epochs_todo > 0.0): - random.shuffle(self.indices) - if epochs_todo >= 1.0: - indices.extend(self.indices) - else: - end_index = int(round(len(self.indices) * epochs_todo)) - indices.extend(self.indices[:end_index]) - - epochs_todo = epochs_todo - 1 - - ratio = len(indices) / float(len(self.indices)) - np.testing.assert_almost_equal(ratio, self.epoch_size, decimal=2) - - return iter(indices) - - def __len__(self) -> int: - return len(self.indices) - - -class LimitLabelsSampler(DistributedSamplerWrapper): - """ - A sampler that limits the number of labels per client - """ - - def __init__(self, dataset, num_replicas, rank, args=(5, 42)): - limit, seed = args - super().__init__(dataset, num_replicas, rank, seed) - - if self.n_clients % self.n_labels != 0: - logging.error( - "multiples of {} clients are needed for the 'limiting-labels' data distribution method, {} does not work".format( - self.n_labels, self.n_clients)) - return - - n_occurrences = limit * int(self.n_clients / self.n_labels) # number of occurrences of each label - counters = [n_occurrences] * self.n_clients # keeps track of which labels still can be given out - labels = list(range(self.n_labels)) # list of labels to distribute - clients = list(range(self.n_clients)) # keeps track of which clients should still be given a label - client_labels = [set() for n in range(self.n_clients)] # set of labels given to each client - random.seed(seed) # seed, such that the same result can be obtained multiple times - - while labels: - # pick a random label - label = random.choice(labels) - counters[label] -= 1 # decrement counter of this label - if counters[label] == 0: # if needed, remove label - labels.remove(label) - - # check which clients the label can be given to - selectable = [i for i in clients if not label in client_labels[i]] - client = None - - if not selectable: - # poor choice, let's fix this -> swap two labels - # conditions for swapping: - # sets of labels A, B, with B incomplete, remaining label l that is not possible to give to B, s.t.: - # (1) l not in A - # (2) exists label l' in A but not in B - # l, l' can be swapped - - client = random.choice(clients) # label can not be given to this client - for c, s in enumerate(client_labels): - if len(s) == limit: # this a completed set - if label not in s: # label can be given to this client (1) - subset = s.difference(client_labels[client]) # remove labels client already has (2...) 
- if subset: # subset is not empty (2 continued): - l = min(subset) # get a swappable label (in a deterministic way), and swap labels - client_labels[c].remove(l) - client_labels[c].add(label) - client_labels[client].add(l) - break - else: # normal operation, pick a rondom selectable client - client = random.choice(selectable) - client_labels[client].add(label) - - # check if this client has been given the maximum number of labels - if len(client_labels[client]) == limit: - clients.remove(client) - - # now we have a set of labels for each client - # client with rank=rank now needs to be given data - # all clients get the same amount of data, the first portion is given to client with rank 1, the second to rank 2, etc - - labels = client_labels[self.client_id] - logging.info("Client {} gets labels {}".format(self.rank, client_labels[self.client_id])) - indices = [] - ordered_by_label = self.order_by_label(dataset) - for label in labels: - n_samples = int(len(ordered_by_label[label]) / n_occurrences) - clients = [c for c, s in enumerate(client_labels) if label in s] # find out which clients have this label - index = clients.index(self.client_id) # find the position of this client - start_index = index * n_samples # inclusive - if rank == self.n_clients: - end_index = len(ordered_by_label[label]) # exclusive - else: - end_index = start_index + n_samples # exclusive - - indices += ordered_by_label[label][start_index:end_index] - - random.seed(seed + self.client_id) # give each client a unique shuffle - random.shuffle(indices) # shuffle indices to spread the labels - - self.indices = indices - - -class Probability_q_Sampler(DistributedSamplerWrapper): - """ - Clients are divided among M groups, with M being the number of labels. - A sample with label m is than given to a member of group m with probability q, - and to any other group with probability (1-q)/(m-1) - - side effect of this method is that the reported loss on the test dataset becomes somewhat meaningless...logging.info("distribution in client with rank {}: {}".format(rank, Counter(labels))) - """ - - def __init__(self, dataset, num_replicas, rank, args=(0.5, 42)): - q, seed = args - super().__init__(dataset, num_replicas, rank, seed) - - if self.n_clients % self.n_labels != 0: - logging.error( - "multiples of {} clients are needed for the 'probability-q-sampler' data distribution method, {} does not work".format( - self.n_labels, self.n_clients)) - return - - # divide data among groups - counter = 0 # for dividing data within a group - group_id = self.client_id % self.n_labels - group_clients = [client for client in range(self.n_clients) if client % self.n_labels == group_id] - indices = [] - random.seed(seed) - ordered_by_label = self.order_by_label(dataset) - for group, label_list in enumerate(ordered_by_label): - for sample_idx in label_list: - rnd_val = random.random() - if rnd_val < q: - if group == group_id: - if group_clients[counter] == self.client_id: - indices.append(sample_idx) - counter = (counter + 1) % len(group_clients) - else: - others = [grp for grp in range(self.n_labels) if grp != group] - if random.choice(others) == group_id: - if group_clients[counter] == self.client_id: - indices.append(sample_idx) - counter = (counter + 1) % len(group_clients) - - labels = [dataset.targets[i] for i in indices] - logging.info("nr of samplers in client with rank {}: {}".format(rank, len(indices))) - logging.info("distribution in client with rank {}: {}".format(rank, Counter(labels))) - - random.seed(seed + self.client_id) # give 
each client a unique shuffle - random.shuffle(indices) # shuffle indices to spread the labels - - self.indices = indices - - -class DirichletSampler(DistributedSamplerWrapper): - """ Generates a (non-iid) data distribution by sampling the dirichlet distribution. Dirichlet constructs a - vector of length num_clients, that sums to one. Decreasing alpha results in a more non-iid data set. - This distribution method results in both label and quantity skew. - """ - - def __init__(self, dataset: Dataset, num_replicas=None, - rank=None, args=(0.5, 42)) -> None: - alpha, seed = args - super().__init__(dataset, num_replicas=num_replicas, rank=rank, seed=seed) - - np.random.seed(seed) - indices = [] - ordered_by_label = self.order_by_label(dataset) - for labels in ordered_by_label: - n_samples = len(labels) - # generate an allocation by sampling dirichlet, which results in how many samples each client gets - allocation = np.random.dirichlet([alpha] * self.n_clients) * n_samples - allocation = allocation.astype(int) - start_index = allocation[0:self.client_id].sum() - end_index = 0 - if self.client_id + 1 == self.n_clients: # last client - end_index = n_samples - else: - end_index = start_index + allocation[self.client_id] - - selection = labels[start_index:end_index] - indices.extend(selection) - - labels = [dataset.targets[i] for i in indices] - logging.info("nr of samplers in client with rank {}: {}".format(rank, len(indices))) - logging.info("distribution in client with rank {}: {}".format(rank, Counter(labels))) - - random.seed(seed + self.client_id) # give each client a unique shuffle - random.shuffle(indices) # shuffle indices to spread the labels - - self.indices = indices - - -class UniformSampler(DistributedSamplerWrapper): - def __init__(self, dataset, num_replicas=None, rank=None, seed=0): - super().__init__(dataset, num_replicas=num_replicas, rank=rank, seed=seed) - indices = list(range(len(self.dataset))) - self.indices = indices[self.rank:self.total_size:self.num_replicas] - - -def get_sampler(dataset, args): - sampler = None - if args.get_distributed(): - method = args.get_sampler() - args.get_logger().info( - "Using {} sampler method, with args: {}".format(method, args.get_sampler_args())) - - if method == "uniform": - sampler = UniformSampler(dataset, num_replicas=args.get_world_size(), rank=args.get_rank()) - elif method == "q sampler": - sampler = Probability_q_Sampler(dataset, num_replicas=args.get_world_size(), rank=args.get_rank(), - args=args.get_sampler_args()) - elif method == "limit labels": - sampler = LimitLabelsSampler(dataset, num_replicas=args.get_world_size(), rank=args.get_rank(), - args=args.get_sampler_args()) - elif method == "dirichlet": - sampler = DirichletSampler(dataset, num_replicas=args.get_world_size(), rank=args.get_rank(), - args=args.get_sampler_args()) - else: # default - args().get_logger().warning("Unknown sampler " + method + ", using uniform instead") - sampler = UniformSampler(dataset, num_replicas=args.get_world_size(), rank=args.get_rank()) - - return sampler diff --git a/fltk/strategy/offloading.py b/fltk/strategy/offloading.py new file mode 100644 index 00000000..3f39f3e2 --- /dev/null +++ b/fltk/strategy/offloading.py @@ -0,0 +1,97 @@ +from enum import Enum + + +class OffloadingStrategy(Enum): + VANILLA = 1 + DEADLINE = 2 + SWYH = 3 + FREEZE = 4 + MODEL_OFFLOAD = 5, + TIFL_BASIC = 6, + TIFL_ADAPTIVE = 7, + DYN_TERMINATE = 8, + DYN_TERMINATE_SWYH = 9, + MODEL_OFFLOAD_STRICT = 10, + MODEL_OFFLOAD_STRICT_SWYH = 11 + + @classmethod + def 
Parse(cls, string_value): + if string_value == 'vanilla': + return OffloadingStrategy.VANILLA + if string_value == 'deadline': + return OffloadingStrategy.DEADLINE + if string_value == 'swyh': + return OffloadingStrategy.SWYH + if string_value == 'freeze': + return OffloadingStrategy.FREEZE + if string_value == 'offload': + return OffloadingStrategy.MODEL_OFFLOAD + if string_value == 'tifl-basic': + return OffloadingStrategy.TIFL_BASIC + if string_value == 'tifl-adaptive': + return OffloadingStrategy.TIFL_ADAPTIVE + if string_value == 'dynamic-terminate': + return OffloadingStrategy.DYN_TERMINATE + if string_value == 'dynamic-terminate-swyh': + return OffloadingStrategy.DYN_TERMINATE_SWYH + if string_value == 'offload-strict': + return OffloadingStrategy.MODEL_OFFLOAD_STRICT + if string_value == 'offload-strict-swyh': + return OffloadingStrategy.MODEL_OFFLOAD_STRICT_SWYH + + +def parse_strategy(strategy: OffloadingStrategy): + deadline_enabled = False + swyh_enabled = False + freeze_layers_enabled = False + offload_enabled = False + dyn_terminate = False + dyn_terminate_swyh = False + if strategy == OffloadingStrategy.VANILLA: + deadline_enabled = False + swyh_enabled = False + freeze_layers_enabled = False + offload_enabled = False + if strategy == OffloadingStrategy.DEADLINE: + deadline_enabled = True + swyh_enabled = False + freeze_layers_enabled = False + offload_enabled = False + if strategy == OffloadingStrategy.SWYH: + deadline_enabled = True + swyh_enabled = True + freeze_layers_enabled = False + offload_enabled = False + if strategy == OffloadingStrategy.FREEZE: + deadline_enabled = True + swyh_enabled = False + freeze_layers_enabled = True + offload_enabled = False + if strategy == OffloadingStrategy.MODEL_OFFLOAD: + deadline_enabled = True + swyh_enabled = False + freeze_layers_enabled = True + offload_enabled = True + if strategy == OffloadingStrategy.DYN_TERMINATE: + deadline_enabled = False + swyh_enabled = False + freeze_layers_enabled = False + offload_enabled = False + dyn_terminate = True + if strategy == OffloadingStrategy.DYN_TERMINATE_SWYH: + deadline_enabled = False + swyh_enabled = False + freeze_layers_enabled = False + offload_enabled = False + dyn_terminate_swyh = True + if strategy == OffloadingStrategy.MODEL_OFFLOAD_STRICT: + deadline_enabled = True + swyh_enabled = True + freeze_layers_enabled = True + offload_enabled = True + if strategy == OffloadingStrategy.MODEL_OFFLOAD_STRICT_SWYH: + deadline_enabled = True + swyh_enabled = True + freeze_layers_enabled = True + offload_enabled = True + return deadline_enabled, swyh_enabled, freeze_layers_enabled, offload_enabled, dyn_terminate, dyn_terminate_swyh diff --git a/fltk/strategy/optimization/FedNova.py b/fltk/strategy/optimization/FedNova.py new file mode 100644 index 00000000..d51895e1 --- /dev/null +++ b/fltk/strategy/optimization/FedNova.py @@ -0,0 +1,184 @@ +import torch +import torch.distributed as dist +from torch.optim.optimizer import Optimizer, required + + +class FedNova(Optimizer): + r"""Implements federated normalized averaging (FedNova). + + Nesterov momentum is based on the formula from + `On the importance of initialization and momentum in deep learning`__. 
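+
+    Compared to plain local SGD, this optimizer also accumulates the round's
+    update per parameter ('cum_grad') and an effective-step normalizer
+    (ai_l1_norm); pre_communicate() then rescales the accumulated update by
+    tau_eff / ai_l1_norm, so clients that ran different numbers of local steps
+    contribute comparable updates.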
+ + Args: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + ratio (float): relative sample size of client + gmf (float): global/server/slow momentum factor + mu (float): parameter for proximal local SGD + lr (float): learning rate + momentum (float, optional): momentum factor (default: 0) + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + dampening (float, optional): dampening for momentum (default: 0) + nesterov (bool, optional): enables Nesterov momentum (default: False) + + Example: + >>> optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9) + >>> optimizer.zero_grad() + >>> loss_fn(model(input), target).backward() + >>> optimizer.step() + + __ http://www.cs.toronto.edu/%7Ehinton/absps/momentum.pdf + + .. note:: + The implementation of SGD with Momentum/Nesterov subtly differs from + Sutskever et. al. and implementations in some other frameworks. + + Considering the specific case of Momentum, the update can be written as + + .. math:: + v = \rho * v + g \\ + p = p - lr * v + + where p, g, v and :math:`\rho` denote the parameters, gradient, + velocity, and momentum respectively. + + This is in contrast to Sutskever et. al. and + other frameworks which employ an update of the form + + .. math:: + v = \rho * v + lr * g \\ + p = p - v + + The Nesterov version is analogously modified. + """ + + def __init__(self, params, lr=0.05, momentum=0.9, dampening=0, + weight_decay=0, nesterov=False, variance=0, mu=0): + self.momentum = momentum + self.mu = mu + self.ai_l1_norm = 0 + self.local_counter = 0 + self.local_steps = 0 + + + if lr is not required and lr < 0.0: + raise ValueError("Invalid learning rate: {}".format(lr)) + if momentum < 0.0: + raise ValueError("Invalid momentum value: {}".format(momentum)) + if weight_decay < 0.0: + raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) + + defaults = dict(lr=lr, momentum=momentum, dampening=dampening, + weight_decay=weight_decay, nesterov=nesterov, variance=variance) + + if nesterov and (momentum <= 0 or dampening != 0): + raise ValueError("Nesterov momentum requires a momentum and zero dampening") + super(FedNova, self).__init__(params, defaults) + + def __setstate__(self, state): + super(FedNova, self).__setstate__(state) + for group in self.param_groups: + group.setdefault('nesterov', False) + + def step(self, closure=None): + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. 
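+
+        A rough sketch of one local round using the FedNova-specific hooks
+        defined below (model, batch, target, loss_fn, local_steps and tau_eff
+        are placeholders, not names from this code base):
+
+        >>> optimizer = FedNova(model.parameters(), lr=0.05, momentum=0.9)
+        >>> for _ in range(local_steps):
+        ...     optimizer.zero_grad()
+        ...     loss_fn(model(batch), target).backward()
+        ...     optimizer.step()
+        >>> optimizer.set_tau_eff(tau_eff)  # effective step count, computed by the caller
+        >>> optimizer.pre_communicate()     # rescale the update before sending parameters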
+ """ + device = "cuda" if torch.cuda.is_available() else "cpu" + + loss = None + if closure is not None: + loss = closure() + + # scale = 1**self.itr + + for group in self.param_groups: + weight_decay = group['weight_decay'] + momentum = group['momentum'] + dampening = group['dampening'] + nesterov = group['nesterov'] + + for p in group['params']: + if p.grad is None: + continue + d_p = p.grad.data + + if weight_decay != 0: + d_p.add_(p.data, alpha=weight_decay) + + param_state = self.state[p] + if 'old_init' not in param_state: + param_state['old_init'] = torch.clone(p.data).detach() + + local_lr = group['lr'] + + # apply momentum updates + if momentum != 0: + if 'momentum_buffer' not in param_state: + buf = param_state['momentum_buffer'] = torch.clone(d_p).detach() + else: + buf = param_state['momentum_buffer'] + buf.mul_(momentum).add_(d_p, alpha=1 - dampening) + if nesterov: + d_p = d_p.add(momentum, buf) + else: + d_p = buf + + # apply proximal updates + if self.mu != 0: + d_p.add_(p.data - param_state['old_init'], alpha=self.mu) + + # update accumulated local updates + if 'cum_grad' not in param_state: + param_state['cum_grad'] = torch.clone(d_p).detach() + param_state['cum_grad'].mul_(local_lr) + else: + param_state['cum_grad'].add_(d_p, alpha=local_lr) + + p.data.add_(d_p, alpha=-local_lr) + + # compute local normalizing vector a_i ... but it's a scalar? + # should't a_i be applied to cum_grad? + # so this must be the l1 norm? -> this seems correct. a_i is not computed directly, only it's l1 norm + if self.momentum != 0: + self.local_counter = self.local_counter * self.momentum + 1 + self.ai_l1_norm += self.local_counter + + self.etamu = local_lr * self.mu + if self.etamu != 0: + self.ai_l1_norm *= (1 - self.etamu) + self.ai_l1_norm += 1 + + if self.momentum == 0 and self.etamu == 0: + self.ai_l1_norm += 1 + + self.local_steps += 1 + + return loss + + def set_tau_eff(self, tau_eff): + self.tau_eff = tau_eff + + def pre_communicate(self): + for group in self.param_groups: + for p in group['params']: + param_state = self.state[p] + + # apply fednova update rule + # learning rate has already been applied + cum_grad = param_state['cum_grad'] + p.data.sub_(cum_grad) # get back to old_init + p.data.add_(cum_grad, alpha=self.tau_eff/self.ai_l1_norm) # rescale changes + + # delete stuff for next round + del param_state['old_init'] + param_state['cum_grad'].zero_() + if 'momentum_buffer' in param_state: + param_state['momentum_buffer'].zero_() + + self.local_counter = 0 + self.ai_l1_norm = 0 + self.local_steps = 0 diff --git a/fltk/strategy/optimization/__init__.py b/fltk/strategy/optimization/__init__.py new file mode 100644 index 00000000..a38c3de0 --- /dev/null +++ b/fltk/strategy/optimization/__init__.py @@ -0,0 +1,13 @@ +import torch +from .fedprox import FedProx +from .FedNova import FedNova +from fltk.util.definitions import Optimizations + + +def get_optimizer(name: Optimizations): + optimizers = { + Optimizations.sgd: torch.optim.SGD, + Optimizations.fedprox: FedProx, + Optimizations.fednova: FedNova + } + return optimizers[name] diff --git a/fltk/strategy/optimization/fedprox.py b/fltk/strategy/optimization/fedprox.py new file mode 100644 index 00000000..7d0d5fe4 --- /dev/null +++ b/fltk/strategy/optimization/fedprox.py @@ -0,0 +1,147 @@ +import torch +from torch.optim.optimizer import Optimizer, required + + +class FedProx(Optimizer): + r"""Implements FedAvg and FedProx. Local Solver can have momentum. 
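+
+    Compared to plain SGD, every step additionally pulls the parameters towards
+    their value at the start of the round ('old_init') with strength mu (the
+    FedProx proximal term); with mu=0 the update reduces to ordinary local SGD.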
+ + Nesterov momentum is based on the formula from + `On the importance of initialization and momentum in deep learning`__. + + Args: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + ratio (float): relative sample size of client + gmf (float): global/server/slow momentum factor + mu (float): parameter for proximal local SGD + lr (float): learning rate + momentum (float, optional): momentum factor (default: 0) + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + dampening (float, optional): dampening for momentum (default: 0) + nesterov (bool, optional): enables Nesterov momentum (default: False) + + Example: + >>> optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9) + >>> optimizer.zero_grad() + >>> loss_fn(model(input), target).backward() + >>> optimizer.step() + + __ http://www.cs.toronto.edu/%7Ehinton/absps/momentum.pdf + + .. note:: + The implementation of SGD with Momentum/Nesterov subtly differs from + Sutskever et. al. and implementations in some other frameworks. + + Considering the specific case of Momentum, the update can be written as + + .. math:: + v = \rho * v + g \\ + p = p - lr * v + + where p, g, v and :math:`\rho` denote the parameters, gradient, + velocity, and momentum respectively. + + This is in contrast to Sutskever et. al. and + other frameworks which employ an update of the form + + .. math:: + v = \rho * v + lr * g \\ + p = p - v + + The Nesterov version is analogously modified. + """ + + def __init__(self, params, lr=0.05, momentum=0.9, dampening=0, + weight_decay=0, nesterov=False, variance=0, mu=0.01): + + self.itr = 0 + self.a_sum = 0 + self.mu = mu + self.loss = None + + if lr is not required and lr < 0.0: + raise ValueError("Invalid learning rate: {}".format(lr)) + if momentum < 0.0: + raise ValueError("Invalid momentum value: {}".format(momentum)) + if weight_decay < 0.0: + raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) + + defaults = dict(lr=lr, momentum=momentum, dampening=dampening, + weight_decay=weight_decay, nesterov=nesterov, variance=variance) + if nesterov and (momentum <= 0 or dampening != 0): + raise ValueError("Nesterov momentum requires a momentum and zero dampening") + super(FedProx, self).__init__(params, defaults) + + + def __setstate__(self, state): + super(FedProx, self).__setstate__(state) + for group in self.param_groups: + group.setdefault('nesterov', False) + + def step(self, closure=None): + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. 
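+
+        Example (illustrative sketch of one FedProx local round; ``model``,
+        ``loss_fn`` and ``data_loader`` are placeholders supplied by the caller):
+
+            >>> optimizer = FedProx(model.parameters(), lr=0.05, momentum=0.9, mu=0.01)
+            >>> for data, target in data_loader:
+            ...     optimizer.zero_grad()
+            ...     loss_fn(model(data), target).backward()
+            ...     optimizer.step()  # pass a closure returning the loss to enable the adaptive-mu heuristic
+            >>> optimizer.pre_communicate()  # drop per-round state before weights are exchanged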
+ """ + + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + weight_decay = group['weight_decay'] + momentum = group['momentum'] + dampening = group['dampening'] + nesterov = group['nesterov'] + + + for p in group['params']: + if p.grad is None: + continue + d_p = p.grad.data + + if weight_decay != 0: + d_p.add_(p.data, alpha=weight_decay) + + param_state = self.state[p] + if 'old_init' not in param_state: + param_state['old_init'] = torch.clone(p.data).detach() + + if momentum != 0: + if 'momentum_buffer' not in param_state: + buf = param_state['momentum_buffer'] = torch.clone(d_p).detach() + else: + buf = param_state['momentum_buffer'] + buf.mul_(momentum).add_(d_p, alpha=1 - dampening) + if nesterov: + d_p = d_p.add(buf, alpha=1 - dampening) + else: + d_p = buf + + # apply proximal update + d_p.add_(p.data - param_state['old_init'], alpha=self.mu) + + p.data.add_(d_p, alpha=-group['lr']) + + # one simple heuristic is to increase μ when seeing + # the loss increasing and decreasing μ when seeing the loss decreasing + if self.loss: + ratio = loss/self.loss # if the new loss is greater, ratio > 1 + self.mu = self.mu*ratio + self.mu = min(1.0, self.mu) + self.mu = max(0.001, self.mu) + self.loss = loss + + + return loss + + def pre_communicate(self): + for group in self.param_groups: + for p in group['params']: + param_state = self.state[p] + if 'old_init' in param_state: + del param_state['old_init'] + if 'momentum_buffer' in param_state: + param_state['momentum_buffer'].zero_() diff --git a/fltk/util/analysis.py b/fltk/util/analysis.py new file mode 100644 index 00000000..1fca083f --- /dev/null +++ b/fltk/util/analysis.py @@ -0,0 +1,114 @@ +from pathlib import Path +import argparse +from typing import List + +import matplotlib.pyplot as plt +import pandas as pd +import seaborn as sns +import re + +from matplotlib.lines import Line2D + + +def get_cwd() -> Path: + return Path.cwd() + + +def get_exp_name(path: Path) -> str: + return path.parent.name + + +def ensure_path_exists(path: Path): + path.mkdir(parents=True, exist_ok=True) + +def load_and_merge_dfs(files: List[Path]) -> pd.DataFrame: + dfs = [pd.read_csv(x) for x in files] + return pd.concat(dfs, ignore_index=True) + +def order_client_names(names: List[str]) -> List[str]: + return sorted(names, key=lambda x: float(re.findall(r'\d+', x)[0])) + +def plot_client_duration(df: pd.DataFrame): + small_df = df[['round_id', 'train_duration', 'test_duration', 'round_duration', 'node_name']].melt(id_vars=['round_id', 'node_name'], var_name='type') + ordered_clients = order_client_names(small_df['node_name'].unique()) + plt.figure() + g = sns.FacetGrid(small_df, col="type", sharey=False) + g.map(sns.boxplot, "node_name", "value", order=ordered_clients) + for axes in g.axes.flat: + _ = axes.set_xticklabels(axes.get_xticklabels(), rotation=90) + plt.tight_layout() + plt.show() + + plt.figure() + g = sns.FacetGrid(small_df, col="type", sharey=False, hue='node_name', hue_order=ordered_clients) + g.map(sns.lineplot, "round_id", "value") + for axes in g.axes.flat: + _ = axes.set_xticklabels(axes.get_xticklabels(), rotation=90) + plt.tight_layout() + plt.show() + +def plot_federator_accuracy(df: pd.DataFrame): + plt.figure() + g = sns.lineplot(data=df, x='round_id', y='test_accuracy') + # df.plot(x="date", y="column2", ax=ax2, legend=False, color="r") + sns.lineplot(ax=g.axes.twinx(), data=df, x='round_id', y='test_loss', color='r') + plt.title('Federator test accuracy') + g.legend(handles=[Line2D([], [], 
marker='_', color="r", label='test_loss'), + Line2D([], [], marker='_', color="b", label='test_accuracy')]) + plt.tight_layout() + plt.show() + +def plot_clients_accuracy(df: pd.DataFrame): + plt.figure() + g = sns.lineplot(data=df, x='round_id', y='accuracy', hue='node_name') + plt.title('Client test accuracy') + plt.tight_layout() + plt.show() + + +def load_replication(path: Path, replication_id: int): + all_files = [x for x in path.iterdir() if x.is_file()] + federator_files = [x for x in all_files if 'federator' in x.name] + client_files = [x for x in all_files if x.name.startswith('client')] + + federator_data = load_and_merge_dfs(federator_files) + federator_data['replication'] = replication_id + client_data = load_and_merge_dfs(client_files) + client_data['replication'] = replication_id + return federator_data, client_data + +def analyse(path: Path): + # cwd = get_cwd() + # output_path = cwd / get_exp_name(path) + # ensure_path_exists(output_path) + replications = [x for x in path.iterdir() if x.is_dir()] + print(replications) + client_dfs = [] + federator_dfs = [] + for replication_path in replications: + replication_id = int(replication_path.name.split('_')[-1][1:]) + federator_data, client_data = load_replication(replication_path, replication_id) + client_dfs.append(client_data) + federator_dfs.append(federator_data) + + federator_df = pd.concat(federator_dfs, ignore_index=True) + client_df = pd.concat(client_dfs, ignore_index=True) + # all_files = [x for x in path.iterdir() if x.is_file()] + # federator_files = [x for x in all_files if 'federator' in x.name] + # client_files = [x for x in all_files if x.name.startswith('client')] + # + # federator_data = load_and_merge_dfs(federator_files) + # client_data = load_and_merge_dfs(client_files) + # + # # print(len(client_data), len(federator_data)) + plot_client_duration(client_df) + plot_federator_accuracy(federator_df) + plot_clients_accuracy(client_df) + # # What do we want to plot in terms of data? 
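+
+# Expected input layout (illustrative; directory and file names are assumptions based on the
+# loaders above): the path passed to `analyse` contains one sub-directory per replication whose
+# name ends in `_r<id>` (e.g. `experiment_r0/`), each holding CSV files whose names contain
+# `federator` or start with `client`, e.g. as written by fltk.util.data_container.DataContainer.
+# Typical invocation (sketch): python3 -m fltk.util.analysis output/<experiment_name>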
+ + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Basic experiment analysis') + parser.add_argument('path', type=str, help='Path pointing to experiment results files') + args = parser.parse_args() + analyse(Path(args.path)) diff --git a/fltk/util/analyze.py b/fltk/util/analyze.py new file mode 100644 index 00000000..0784978b --- /dev/null +++ b/fltk/util/analyze.py @@ -0,0 +1,44 @@ +import matplotlib.pyplot as plt +import pandas as pd +import seaborn as sns + + + + +if __name__ == '__main__': + exp_name = 'output/exp_p3_w4_s4_deadline' + + general_file = f'{exp_name}-general_data.csv' + print(f'Loading data file: {general_file}') + df = pd.read_csv(general_file) + print(df) + + + plt.figure() + sns.pointplot(data=df, x='epoch', y='accuracy') + plt.title('Accuracy per epoch') + plt.show() + + plt.figure() + # sns.pointplot(data=df[df['epoch'] > 1], x='epoch', y='duration') + sns.pointplot(data=df, x='epoch', y='duration') + plt.title('Train time per epoch') + plt.show() + + dfs = [] + for file in [f'{exp_name}_client1_epochs.csv', f'{exp_name}_client2_epochs.csv', f'{exp_name}_client3_epochs.csv', f'{exp_name}_client4_epochs.csv']: + dfs.append(pd.read_csv(file)) + client_df = pd.concat(dfs, ignore_index=True) + + print('Loading client data') + plt.figure() + # sns.pointplot(data=client_df[client_df['epoch_id'] > 1], x='epoch_id', y='duration_train', hue='client_id') + sns.pointplot(data=client_df, x='epoch_id', y='duration_train', hue='client_id') + plt.title('Train time per epoch clients') + plt.show() + + plt.figure() + sns.pointplot(data=client_df, x='epoch_id', y='accuracy', hue='client_id') + plt.title('Accuracy per epoch clients') + plt.show() + diff --git a/fltk/util/base_config.py b/fltk/util/base_config.py new file mode 100644 index 00000000..3d0a9382 --- /dev/null +++ b/fltk/util/base_config.py @@ -0,0 +1,425 @@ +from datetime import datetime + +import torch +import json + +from fltk.datasets.distributed import DistCIFAR10Dataset, DistCIFAR100Dataset, DistFashionMNISTDataset +from fltk.datasets.distributed.mnist import DistMNISTDataset +from fltk.nets import Cifar10CNN, FashionMNISTCNN, Cifar100ResNet, FashionMNISTResNet, Cifar10ResNet, Cifar100VGG +from fltk.nets.mnist_cnn import MNIST_CNN +from fltk.strategy.optimization import FedProx, FedNova +from fltk.util.definitions import Optimizations, DataSampler, Nets, Dataset + +SEED = 1 +torch.manual_seed(SEED) + + +class BareConfig: + + def __init__(self): + # self.logger = logger + + self.batch_size = 1 + self.test_batch_size = 1000 + self.epochs = 1 + # self.lr = 0.001 + self.lr = 0.0001 + # self.momentum = 0.9 + self.momentum = 0.1 + self.cuda = False + self.shuffle = False + self.log_interval = 10 + self.kwargs = {} + self.contribution_measurement_round = 1 + self.contribution_measurement_metric = 'Influence' + self.epochs_per_round = 1 + + self.scheduler_step_size = 50 + self.scheduler_gamma = 0.5 + self.min_lr = 1e-10 + + self.loss_function = torch.nn.CrossEntropyLoss + self.optimizer = torch.optim.SGD + + self.optimizers = { + Optimizations.sgd: torch.optim.SGD, + Optimizations.fedprox: FedProx, + Optimizations.fednova: FedNova + } + + self.optimizer_args = { + 'lr': self.lr, + 'momentum': self.momentum + } + + self.round_worker_selection_strategy = None + self.round_worker_selection_strategy_kwargs = None + + self.save_model = False + self.save_temp_model = False + self.save_epoch_interval = 1 + self.save_model_path = "models" + self.epoch_save_start_suffix = "start" + 
self.epoch_save_end_suffix = "end" + self.get_poison_effort = 'half' + self.num_workers = 50 + # self.num_poisoned_workers = 10 + + self.offload_strategy = 'vanilla' + self.profiling_size = 30 + self.deadline = 400 + self.first_deadline = 400 + self.deadline_threshold = 10 + self.warmup_round = False + + # FLTK options + self.node_groups = None + + # Termination policy data + self.termination_percentage = 1 + + self.federator_host = '0.0.0.0' + self.rank = 0 + self.world_size = 0 + self.data_sampler = DataSampler.uniform + self.data_sampler_args = None + self.distributed = False + + self.available_nets = { + Nets.cifar100_resnet: Cifar100ResNet, + Nets.cifar100_vgg: Cifar100VGG, + Nets.cifar10_cnn: Cifar10CNN, + Nets.cifar10_resnet: Cifar10ResNet, + Nets.fashion_mnist_cnn: FashionMNISTCNN, + Nets.fashion_mnist_resnet: FashionMNISTResNet, + Nets.mnist_cnn: MNIST_CNN, + + } + + self.nets_split_point = { + Nets.cifar100_resnet: 48, + Nets.cifar100_vgg: 28, + Nets.cifar10_cnn: 15, + Nets.cifar10_resnet: 39, + Nets.fashion_mnist_cnn: 7, + Nets.fashion_mnist_resnet: 7, + Nets.mnist_cnn: 2, + } + self.net = None + self.net_name = Nets.cifar10_cnn + self.set_net_by_name(self.net_name.value) + # self.dataset_name = 'cifar10' + self.dataset_name = Dataset.cifar10 + + self.DistDatasets = { + Dataset.cifar10: DistCIFAR10Dataset, + Dataset.cifar100: DistCIFAR100Dataset, + Dataset.fashion_mnist: DistFashionMNISTDataset, + Dataset.mnist: DistMNISTDataset + } + self.train_data_loader_pickle_path = { + Dataset.cifar10: 'data_loaders/cifar10/train_data_loader.pickle', + Dataset.fashion_mnist: 'data_loaders/fashion-mnist/train_data_loader.pickle', + Dataset.cifar100: 'data_loaders/cifar100/train_data_loader.pickle', + Dataset.mnist: 'data_loaders/mnist/train_data_loader.pickle', + } + + self.test_data_loader_pickle_path = { + Dataset.cifar10: 'data_loaders/cifar10/test_data_loader.pickle', + Dataset.fashion_mnist: 'data_loaders/fashion-mnist/test_data_loader.pickle', + Dataset.cifar100: 'data_loaders/cifar100/test_data_loader.pickle', + Dataset.mnist: 'data_loaders/mnist/test_data_loader.pickle', + + } + self.loss_function = torch.nn.CrossEntropyLoss + self.default_model_folder_path = "default_models" + self.data_path = "data" + + # For freezing effect experiment + self.freeze_clients = [] + + ########### + # Methods # + ########### + + def merge_yaml(self, cfg = {}): + """ + total_epochs: 20 + epochs_per_cycle: 2 + wait_for_clients: true + net: Cifar10CNN + dataset: cifar10 + experiment_prefix: 'experiment' + output_location: 'output' + tensor_board_active: true + :param yaml_config: + :return: + """ + if 'total_epochs' in cfg: + self.epochs = cfg['total_epochs'] + if 'epochs_per_cycle' in cfg: + self.epochs_per_cycle = cfg['epochs_per_cycle'] + if 'wait_for_clients' in cfg: + self.wait_for_clients = cfg['wait_for_clients'] + if 'net' in cfg: + self.net_name = Nets(cfg['net']) + self.set_net_by_name(cfg['net']) + if 'dataset' in cfg: + self.dataset_name = Dataset(cfg['dataset']) + if 'offload_stategy' in cfg: + self.offload_strategy = cfg['offload_stategy'] + if 'profiling_size' in cfg: + self.profiling_size = cfg['profiling_size'] + if 'deadline' in cfg: + self.deadline = cfg['deadline'] + if 'deadline_threshold' in cfg: + self.deadline_threshold = cfg['deadline_threshold'] + if 'first_deadline' in cfg: + self.first_deadline = cfg['first_deadline'] + if 'warmup_round' in cfg: + self.warmup_round = cfg['warmup_round'] + if 'experiment_prefix' in cfg: + self.experiment_prefix = cfg['experiment_prefix'] + 
else: + self.experiment_prefix = f'{datetime.now()}' + if 'output_location' in cfg: + self.output_location = cfg['output_location'] + if 'tensor_board_active' in cfg: + self.tensor_board_active = cfg['tensor_board_active'] + if 'clients_per_round' in cfg: + self.clients_per_round = cfg['clients_per_round'] + if 'system' in cfg: + if 'clients' in cfg['system']: + if 'amount' in cfg['system']['clients']: + self.world_size = cfg['system']['clients']['amount'] + 1 + + if 'system' in cfg: + if 'federator' in cfg['system']: + if 'hostname' in cfg['system']['federator']: + self.federator_host = cfg['system']['federator']['hostname'] + if 'cuda' in cfg: + if cfg['cuda']: + self.cuda = True + else: + self.cuda = False + if 'optimizer' in cfg: + self.optimizer = self.optimizers[Optimizations(cfg['optimizer'])] + if 'optimizer_args' in cfg: + for k, v in cfg['optimizer_args'].items(): + self.optimizer_args[k] = v + if 'sampler' in cfg: + self.data_sampler = DataSampler(cfg['sampler']) + if 'sampler_args' in cfg: + self.data_sampler_args = cfg['sampler_args'] + + if 'node_groups' in cfg: + self.node_groups = cfg['node_groups'] + if 'termination_percentage' in cfg: + self.termination_percentage = cfg['termination_percentage'] + + if 'epochs_per_round' in cfg: + self.epochs_per_round = cfg['epochs_per_round'] + if 'freeze_clients' in cfg: + self.freeze_clients = cfg['freeze_clients'] + + + + def init_logger(self, logger): + self.logger = logger + + def get_distributed(self): + return self.distributed + + def get_rank(self): + return self.rank + + def get_world_size(self): + return self.world_size + + def set_sampler(self, sampler): + self.data_sampler = sampler + + def get_sampler(self): + return self.data_sampler + + def get_optimizer(self): + return self.optimizer + + def get_sampler_args(self): + return tuple(self.data_sampler_args) + + def get_round_worker_selection_strategy(self): + return self.round_worker_selection_strategy + + def get_round_worker_selection_strategy_kwargs(self): + return self.round_worker_selection_strategy_kwargs + + def set_round_worker_selection_strategy_kwargs(self, kwargs): + self.round_worker_selection_strategy_kwargs = kwargs + + def set_client_selection_strategy(self, strategy): + self.round_worker_selection_strategy = strategy + + def get_data_path(self): + return self.data_path + + def get_epoch_save_start_suffix(self): + return self.epoch_save_start_suffix + + def get_epoch_save_end_suffix(self): + return self.epoch_save_end_suffix + + def get_dataloader_list(self): + return list(self.train_data_loader_pickle_path.keys()) + + def get_nets_list(self): + return list(map(lambda c: c.value, Nets)) + + def set_train_data_loader_pickle_path(self, path, name='cifar10'): + self.train_data_loader_pickle_path[name] = path + + def get_train_data_loader_pickle_path(self): + return self.train_data_loader_pickle_path[self.dataset_name] + + def set_test_data_loader_pickle_path(self, path, name='cifar10'): + self.test_data_loader_pickle_path[name] = path + + def get_test_data_loader_pickle_path(self): + return self.test_data_loader_pickle_path[self.dataset_name] + + def set_net_by_name(self, name: str): + self.net_name = Nets(name) + self.net = self.available_nets[self.net_name] + + def get_cuda(self): + return self.cuda + + def get_scheduler_step_size(self): + return self.scheduler_step_size + + def get_scheduler_gamma(self): + return self.scheduler_gamma + + def get_min_lr(self): + return self.min_lr + + def get_default_model_folder_path(self): + return 
self.default_model_folder_path + + def get_num_epochs(self): + return self.epochs + + def set_num_poisoned_workers(self, num_poisoned_workers): + self.num_poisoned_workers = num_poisoned_workers + + def set_num_workers(self, num_workers): + self.num_workers = num_workers + + def set_model_save_path(self, save_model_path): + self.save_model_path = save_model_path + + def get_logger(self): + return self.logger + + def get_loss_function(self): + return self.loss_function + + def get_net(self): + return self.net + + def get_num_workers(self): + return self.num_workers + + def get_num_poisoned_workers(self): + return self.num_poisoned_workers + + def get_poison_effort(self): + return self.get_poison_effort + + def get_learning_rate(self): + return self.lr + + def get_momentum(self): + return self.momentum + + def get_shuffle(self): + return self.shuffle + + def get_batch_size(self): + return self.batch_size + + def get_test_batch_size(self): + return self.test_batch_size + + def get_log_interval(self): + return self.log_interval + + def get_save_model_folder_path(self): + return self.save_model_path + + def get_learning_rate_from_epoch(self, epoch_idx): + lr = self.lr * (self.scheduler_gamma ** int(epoch_idx / self.scheduler_step_size)) + + if lr < self.min_lr: + self.logger.warning("Updating LR would place it below min LR. Skipping LR update.") + + return self.min_lr + + self.logger.debug("LR: {}".format(lr)) + + return lr + + def get_contribution_measurement_round(self): + return self.contribution_measurement_round + + def get_contribution_measurement_metric(self): + return self.contribution_measurement_metric + + def should_save_model(self, epoch_idx): + """ + Returns true/false models should be saved. + + :param epoch_idx: current training epoch index + :type epoch_idx: int + """ + if not self.save_model: + return False + + if epoch_idx == 1 or epoch_idx % self.save_epoch_interval == 0: + return True + + def log(self): + """ + Log this arguments object to the logger. 
+ """ + self.logger.debug("Arguments: {}", str(self)) + + def __str__(self): + return "\nBatch Size: {}\n".format(self.batch_size) + \ + "Test Batch Size: {}\n".format(self.test_batch_size) + \ + "Epochs: {}\n".format(self.epochs) + \ + "Learning Rate: {}\n".format(self.lr) + \ + "Momentum: {}\n".format(self.momentum) + \ + "CUDA Enabled: {}\n".format(self.cuda) + \ + "Shuffle Enabled: {}\n".format(self.shuffle) + \ + "Log Interval: {}\n".format(self.log_interval) + \ + "Scheduler Step Size: {}\n".format(self.scheduler_step_size) + \ + "Scheduler Gamma: {}\n".format(self.scheduler_gamma) + \ + "Scheduler Minimum Learning Rate: {}\n".format(self.min_lr) + \ + "Client Selection Strategy: {}\n".format(self.round_worker_selection_strategy) + \ + "Client Selection Strategy Arguments: {}\n".format( + json.dumps(self.round_worker_selection_strategy_kwargs, indent=4, sort_keys=True)) + \ + "Model Saving Enabled: {}\n".format(self.save_model) + \ + "Model Saving Interval: {}\n".format(self.save_epoch_interval) + \ + "Model Saving Path (Relative): {}\n".format(self.save_model_path) + \ + "Epoch Save Start Prefix: {}\n".format(self.epoch_save_start_suffix) + \ + "Epoch Save End Suffix: {}\n".format(self.epoch_save_end_suffix) + \ + "Number of Clients: {}\n".format(self.num_workers) + \ + "Number of Poisoned Clients: {}\n".format(self.num_poisoned_workers) + \ + "NN: {}\n".format(self.net) + \ + "Train Data Loader Path: {}\n".format(self.train_data_loader_pickle_path) + \ + "Test Data Loader Path: {}\n".format(self.test_data_loader_pickle_path) + \ + "Loss Function: {}\n".format(self.loss_function) + \ + "Default Model Folder Path: {}\n".format(self.default_model_folder_path) + \ + "Data Path: {}\n".format(self.data_path) + \ + "Dataset Name: {}\n".format(self.dataset_name) \ No newline at end of file diff --git a/fltk/util/cluster/client.py b/fltk/util/cluster/client.py index ce2bf0ce..7c536442 100644 --- a/fltk/util/cluster/client.py +++ b/fltk/util/cluster/client.py @@ -13,7 +13,7 @@ V1VolumeMount, V1Toleration, V1Volume, V1PersistentVolumeClaimVolumeSource from fltk.util.cluster.conversion import Convert -from fltk.util.config import BareConfig +from fltk.util.config import DistributedConfig from fltk.util.singleton import Singleton from fltk.util.task.task import ArrivalTask @@ -221,7 +221,7 @@ def build_resources(self, arrival_task: ArrivalTask) -> None: self._buildDescription.resources = client.V1ResourceRequirements(requests=req_dict, limits=req_dict) - def _generate_command(self, config: BareConfig, task: ArrivalTask): + def _generate_command(self, config: DistributedConfig, task: ArrivalTask): command = (f'python3 -m fltk client {config.config_path} {task.id} ' f'--model {task.network} --dataset {task.dataset} ' f'--optimizer Adam --max_epoch {task.param_conf.max_epoch} ' @@ -230,7 +230,7 @@ def _generate_command(self, config: BareConfig, task: ArrivalTask): f'--backend gloo') return command.split(' ') - def _build_container(self, conf: BareConfig, task: ArrivalTask, name: str = "pytorch", + def _build_container(self, conf: DistributedConfig, task: ArrivalTask, name: str = "pytorch", vol_mnts: List[V1VolumeMount] = None) -> V1Container: return V1Container( name=name, @@ -242,10 +242,10 @@ def _build_container(self, conf: BareConfig, task: ArrivalTask, name: str = "pyt volume_mounts=vol_mnts ) - def build_worker_container(self, conf: BareConfig, task: ArrivalTask, name: str = "pytorch") -> None: + def build_worker_container(self, conf: DistributedConfig, task: ArrivalTask, name: str = 
"pytorch") -> None: self._buildDescription.worker_container = self._build_container(conf, task, name) - def build_master_container(self, conf: BareConfig, task: ArrivalTask, name: str = "pytorch") -> None: + def build_master_container(self, conf: DistributedConfig, task: ArrivalTask, name: str = "pytorch") -> None: """ Function to build the Master worker container. This requires the LOG PV to be mounted on the expected logging directory. Make sure that any changes in the Helm charts are also reflected here. @@ -263,7 +263,7 @@ def build_master_container(self, conf: BareConfig, task: ArrivalTask, name: str )] self._buildDescription.master_container = self._build_container(conf, task, name, master_mounts) - def build_container(self, task: ArrivalTask, conf: BareConfig): + def build_container(self, task: ArrivalTask, conf: DistributedConfig): self.build_master_container(conf, task) self.build_worker_container(conf, task) @@ -342,12 +342,12 @@ def create_identifier(self, task: ArrivalTask): self._buildDescription.id = task.id -def construct_job(conf: BareConfig, task: ArrivalTask) -> V1PyTorchJob: +def construct_job(conf: DistributedConfig, task: ArrivalTask) -> V1PyTorchJob: """ Function to build a Job, based on the specifications of an ArrivalTask, and the general configuration of the BareConfig. @param conf: configuration object that contains specifics to properly start a client. - @type conf: BareConfig + @type conf: DistributedConfig @param task: Learning task for which a job description must be made. @type task: ArrivalTask @return: KubeFlow compatible PyTorchJob description to create a Job with the requested system and hyper parameters. @@ -361,5 +361,6 @@ def construct_job(conf: BareConfig, task: ArrivalTask) -> V1PyTorchJob: dp_builder.build_template() dp_builder.build_spec(task) job = dp_builder.construct() + # Fix to deploy on more up-to-date Kubernetes clusters. job.openapi_types = job.swagger_types return job diff --git a/fltk/util/config/__init__.py b/fltk/util/config/__init__.py index bdcc8f70..08739bf1 100644 --- a/fltk/util/config/__init__.py +++ b/fltk/util/config/__init__.py @@ -1 +1,2 @@ -from .base_config import * +from .distributed_config import * +from .config import Config \ No newline at end of file diff --git a/fltk/util/config/arguments.py b/fltk/util/config/arguments.py index 79ce2490..2f128bdb 100644 --- a/fltk/util/config/arguments.py +++ b/fltk/util/config/arguments.py @@ -56,7 +56,7 @@ class LearningParameters: } _available_optimizer = { - "ADAM": torch.optim.SGD + "ADAM": torch.optim.Adam } @staticmethod @@ -110,7 +110,7 @@ def get_optimizer(self) -> Type[torch.optim.Optimizer]: def extract_learning_parameters(args: Namespace) -> LearningParameters: """ - Function to extract the learning hyper-parameters from the Namespace object for the passed arguments. + Function to extract the learning hyperparameters from the Namespace object for the passed arguments. @param args: Namespace environment for running the Client. @type args: Namespace @return: Parsed learning parameters. 
@@ -151,3 +151,49 @@ def create_cluster_parser(subparsers) -> None: cluster_parser = subparsers.add_parser('cluster') cluster_parser.add_argument('config', type=str) cluster_parser.add_argument('-l', '--local', type=bool, default=False) + + +def create_container_util_parser(subparsers) -> None: + util_docker_parser = subparsers.add_parser('util-docker') + util_docker_parser.add_argument('name', type=str) + util_docker_parser.add_argument('--clients', type=int) + + +def create_util_parser(subparsers): + util_generate_parser = subparsers.add_parser('util-generate') + util_generate_parser.add_argument('path', type=str) + + +def create_util_run_parser(subparsers) -> None: + util_run_parser = subparsers.add_parser('util-run') + util_run_parser.add_argument('path', type=str) + + +def create_remote_parser(subparsers) -> None: + remote_parser = subparsers.add_parser('remote') + remote_parser.add_argument('rank', type=int) + remote_parser.add_argument('--nic', type=str, default=None) + remote_parser.add_argument('--host', type=str, default=None) + add_default_arguments(remote_parser) + + +def create_single_parser(subparsers) -> None: + single_machine_parser = subparsers.add_parser('single') + add_default_arguments(single_machine_parser) + + +def add_default_arguments(*parsers): + for parser in parsers: + parser.add_argument('config', type=str, help='') + parser.add_argument('--prefix', type=str, default=None) + + +def create_all_subparsers(subparsers): + create_extractor_parser(subparsers) + create_client_parser(subparsers) + create_cluster_parser(subparsers) + create_container_util_parser(subparsers) + create_util_parser(subparsers) + create_util_run_parser(subparsers) + create_remote_parser(subparsers) + create_single_parser(subparsers) diff --git a/fltk/util/config/config.py b/fltk/util/config/config.py new file mode 100644 index 00000000..c83356db --- /dev/null +++ b/fltk/util/config/config.py @@ -0,0 +1,120 @@ +import copy +from dataclasses import dataclass +from enum import Enum, EnumMeta +from pathlib import Path +from typing import Type + +import torch +import yaml +from fltk.util.log import getLogger + +from fltk.util.definitions import Dataset, Nets, DataSampler, Optimizations, LogLevel, Aggregations + + +@dataclass +class Config: + # optimizer: Optimizations + batch_size: int = 1 + test_batch_size: int = 1000 + rounds: int = 2 + epochs: int = 1 + lr: float = 0.01 + momentum: float = 0.1 + cuda: bool = False + shuffle: bool = False + log_interval: int = 10 + scheduler_step_size: int = 50 + scheduler_gamma: float = 0.5 + min_lr: float = 1e-10 + + # @TODO: Set seed from configuration + rng_seed = 0 + # Enum + optimizer: Optimizations = Optimizations.sgd + optimizer_args = { + 'lr': lr, + 'momentum': momentum + } + loss_function = torch.nn.CrossEntropyLoss + # Enum + log_level: LogLevel = LogLevel.DEBUG + + num_clients: int = 10 + clients_per_round: int = 2 + distributed: bool = True + single_machine: bool = False + # Enum + aggregation: Aggregations = Aggregations.fedavg + # Enum + dataset_name: Dataset = Dataset.mnist + # Enum + net_name: Nets = Nets.mnist_cnn + default_model_folder_path: str = "default_models" + data_path: str = "data" + # Enum + data_sampler: DataSampler = DataSampler.uniform + data_sampler_args = [] + + rank: int = 0 + world_size: int = 0 + + replication_id: int = None + experiment_prefix: str = '' + + real_time : bool = False + + # Save data in append mode. Thereby flushing on every append to file. 
+ # This could be useful when a system is likely to crash midway an experiment + save_data_append: bool = False + output_path: Path = Path('output_test_2') + + def __init__(self, **kwargs) -> None: + enum_fields = [x for x in self.__dataclass_fields__.items() if isinstance(x[1].type, Enum) or isinstance(x[1].type, EnumMeta)] + if 'dataset' in kwargs and 'dataset_name' not in kwargs: + kwargs['dataset_name'] = kwargs['dataset'] + if 'net' in kwargs and 'net_name' not in kwargs: + kwargs['net_name'] = kwargs['net'] + for name, field in enum_fields: + if name in kwargs and isinstance(kwargs[name], str): + kwargs[name] = field.type(kwargs[name]) + for name, value in kwargs.items(): + self.__setattr__(name, value) + if name == 'output_location': + self.output_path = Path(value) + self.update_rng_seed() + + + def update_rng_seed(self): + torch.manual_seed(self.rng_seed) + def get_default_model_folder_path(self): + return self.default_model_folder_path + + def get_distributed(self): + return self.distributed + + def get_sampler(self): + return self.data_sampler + + def get_world_size(self): + return self.world_size + + def get_rank(self): + return self.rank + + def get_sampler_args(self): + return tuple(self.data_sampler_args) + + def get_data_path(self): + return self.data_path + + def get_loss_function(self): + return self.loss_function + + @classmethod + def FromYamlFile(cls, path: Path): + getLogger(__name__).debug(f'Loading yaml from {path.absolute()}') + with open(path) as file: + content = yaml.safe_load(file) + for k, v in content.items(): + getLogger(__name__).debug(f'Inserting key "{k}" into config') + return cls(**content) diff --git a/fltk/util/config/base_config.py b/fltk/util/config/distributed_config.py similarity index 99% rename from fltk/util/config/base_config.py rename to fltk/util/config/distributed_config.py index 25349406..4f089a8d 100644 --- a/fltk/util/config/base_config.py +++ b/fltk/util/config/distributed_config.py @@ -114,7 +114,7 @@ def load_incluster_image(self): @dataclass_json @dataclass -class BareConfig(object): +class DistributedConfig(object): execution_config: ExecutionConfig cluster_config: ClusterConfig = field(metadata=config(field_name="cluster")) config_path: Path = None diff --git a/fltk/util/data_container.py b/fltk/util/data_container.py new file mode 100644 index 00000000..00cee9c1 --- /dev/null +++ b/fltk/util/data_container.py @@ -0,0 +1,88 @@ +import abc +import csv +import time +from dataclasses import dataclass +from pathlib import Path +from typing import Union, List, Type, TextIO + + +@dataclass +class DataRecord(abc.ABC): + pass + + +@dataclass +class FederatorRecord(DataRecord): + num_selected_clients: int + round_id: int + round_duration: int + test_loss: float + test_accuracy: float + # Accuracy per class? + timestamp: float = time.time() + node_name: str = '' + + +@dataclass +class ClientRecord(DataRecord): + round_id: int + train_duration: float + test_duration: float + round_duration: float + num_epochs: int + trained_items: int + accuracy: float + train_loss: float + test_loss: float + # Accuracy per class? 
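+    # Note: a bare `time.time()` default is evaluated once, when the class body is executed, so
+    # records created without an explicit timestamp all share that value (the same holds for
+    # FederatorRecord above); pass the timestamp explicitly when per-record times are required.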
+ timestamp: float = time.time() + node_name: str = '' + + +class DataContainer: + records: List[DataRecord] + file_name: str + file_handle: TextIO + file_path: Path + append_mode: bool + record_type: Type[DataRecord] + delimiter = ',' + name: str + + def __init__(self, name: str, output_location: Path, record_type: Type[DataRecord], append_mode: bool = False): + # print(f'Creating new Data container for client {name}') + self.records: List[record_type] = list() + self.file_name = f'{name}.csv' + self.name = name + output_location = Path(output_location) + output_location.mkdir(parents=True, exist_ok=True) + self.file_path = output_location / self.file_name + self.append_mode = append_mode + file_flag = 'a' if append_mode else 'w' + self.file_handle = open(self.file_path, file_flag) + print(f'[<=========>] Creating data container at {self.file_path}') + self.record_type = record_type + if self.append_mode: + open(self.file_path, 'w').close() + dw = csv.DictWriter(self.file_handle, self.record_type.__annotations__) + dw.writeheader() + self.file_handle.flush() + + def append(self, record: DataRecord): + record.node_name = self.name + self.records.append(record) + if self.append_mode: + dw = csv.DictWriter(self.file_handle, self.record_type.__annotations__) + dw.writerow(record.__dict__) + self.file_handle.flush() + + def save(self): + if self.append_mode: + return + dw = csv.DictWriter(self.file_handle, self.record_type.__annotations__) + dw.writeheader() + # print(f'Saving {len(self.records)} for node {self.name}') + for record in self.records: + record.node_name = self.name + dw.writerow(record.__dict__) + self.file_handle.flush() diff --git a/fltk/util/definitions.py b/fltk/util/definitions.py new file mode 100644 index 00000000..2492b062 --- /dev/null +++ b/fltk/util/definitions.py @@ -0,0 +1,64 @@ +######### Definitions ######### +# 1. Datasets # +# 2. Networks (models) # +# 3. Aggregation methods # +# 4. Client selection methods # +# 5. Data samplers # +# 6. Optimizers # +############################### +# Use enums instead of dataclasses? 
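+
+# Example (sketch): enum members are looked up by their string value, which is how the
+# configuration code resolves them, e.g. Optimizations('FedProx') -> Optimizations.fedprox
+# and Dataset('cifar10') -> Dataset.cifar10.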
+from enum import Enum, unique + + +@unique +class DataSampler(Enum): + uniform = "uniform" + q_sampler = "q sampler" + limit_labels = "limit labels" + dirichlet = "dirichlet" + limit_labels_q = "limit labels q" + emd_sampler = 'emd sampler' + limit_labels_flex = "limit labels flex" + n_labels = "n labels" + + +@unique +class Optimizations(Enum): + sgd = 'SGD' + fedprox = 'FedProx' + fednova = 'FedNova' + + +@unique +class Dataset(Enum): + cifar10 = 'cifar10' + cifar100 = 'cifar100' + fashion_mnist = 'fashion-mnist' + mnist = 'mnist' + +class LogLevel(Enum): + CRITICAL = 50 + FATAL = CRITICAL + ERROR = 40 + WARNING = 30 + WARN = WARNING + INFO = 20 + DEBUG = 10 + NOTSET = 0 + +@unique +class Aggregations(Enum): + avg = 'Avg' + fedavg = 'FedAvg' + sum = 'Sum' + + +@unique +class Nets(Enum): + cifar100_resnet = "Cifar100ResNet" + cifar100_vgg = "Cifar100VGG" + cifar10_cnn = "Cifar10CNN" + cifar10_resnet = "Cifar10ResNet" + fashion_mnist_cnn = "FashionMNISTCNN" + fashion_mnist_resnet = "FashionMNISTResNet" + mnist_cnn = 'MNISTCNN' diff --git a/fltk/util/generate_data_distribution.py b/fltk/util/generate_data_distribution.py new file mode 100644 index 00000000..047aa202 --- /dev/null +++ b/fltk/util/generate_data_distribution.py @@ -0,0 +1,71 @@ +import pathlib +import os +import logging + +from fltk.datasets.distributed import DistCIFAR10Dataset, DistCIFAR100Dataset, DistFashionMNISTDataset +# from fltk.datasets import CIFAR10Dataset, FashionMNISTDataset, CIFAR100Dataset +from fltk.util.arguments import Arguments +from fltk.util.data_loader_utils import generate_train_loader, generate_test_loader, save_data_loader_to_file + +logging.basicConfig(level=logging.DEBUG) + + +if __name__ == '__main__': + args = Arguments(logging) + + # --------------------------------- + # ------------ CIFAR10 ------------ + # --------------------------------- + dataset = DistCIFAR10Dataset(args) + TRAIN_DATA_LOADER_FILE_PATH = "data_loaders/cifar10/train_data_loader.pickle" + TEST_DATA_LOADER_FILE_PATH = "data_loaders/cifar10/test_data_loader.pickle" + + if not os.path.exists("data_loaders/cifar10"): + pathlib.Path("data_loaders/cifar10").mkdir(parents=True, exist_ok=True) + + train_data_loader = generate_train_loader(args, dataset) + test_data_loader = generate_test_loader(args, dataset) + + with open(TRAIN_DATA_LOADER_FILE_PATH, "wb") as f: + save_data_loader_to_file(train_data_loader, f) + + with open(TEST_DATA_LOADER_FILE_PATH, "wb") as f: + save_data_loader_to_file(test_data_loader, f) + + # --------------------------------- + # --------- Fashion-MNIST --------- + # --------------------------------- + dataset = DistFashionMNISTDataset(args) + TRAIN_DATA_LOADER_FILE_PATH = "data_loaders/fashion-mnist/train_data_loader.pickle" + TEST_DATA_LOADER_FILE_PATH = "data_loaders/fashion-mnist/test_data_loader.pickle" + + if not os.path.exists("data_loaders/fashion-mnist"): + pathlib.Path("data_loaders/fashion-mnist").mkdir(parents=True, exist_ok=True) + + train_data_loader = generate_train_loader(args, dataset) + test_data_loader = generate_test_loader(args, dataset) + + with open(TRAIN_DATA_LOADER_FILE_PATH, "wb") as f: + save_data_loader_to_file(train_data_loader, f) + + with open(TEST_DATA_LOADER_FILE_PATH, "wb") as f: + save_data_loader_to_file(test_data_loader, f) + + # --------------------------------- + # ------------ CIFAR100 ----------- + # --------------------------------- + dataset = DistCIFAR100Dataset(args) + TRAIN_DATA_LOADER_FILE_PATH = "data_loaders/cifar100/train_data_loader.pickle" + 
TEST_DATA_LOADER_FILE_PATH = "data_loaders/cifar100/test_data_loader.pickle" + + if not os.path.exists("data_loaders/cifar100"): + pathlib.Path("data_loaders/cifar100").mkdir(parents=True, exist_ok=True) + + train_data_loader = generate_train_loader(args, dataset) + test_data_loader = generate_test_loader(args, dataset) + + with open(TRAIN_DATA_LOADER_FILE_PATH, "wb") as f: + save_data_loader_to_file(train_data_loader, f) + + with open(TEST_DATA_LOADER_FILE_PATH, "wb") as f: + save_data_loader_to_file(test_data_loader, f) diff --git a/fltk/util/generate_docker_compose.py b/fltk/util/generate_docker_compose.py new file mode 100644 index 00000000..b58233d3 --- /dev/null +++ b/fltk/util/generate_docker_compose.py @@ -0,0 +1,574 @@ +import sys +from pathlib import Path + +import yaml +import copy +import argparse + +# global_template_path = './deploy/templates' + +global_template_path = Path(__file__).absolute().parent.parent.parent / 'deploy' / 'templates' +global_template_path = global_template_path.__str__() +print(global_template_path) +def load_system_template(template_path = global_template_path): + print(f'Loading system template from {template_path}/system_stub.yml') + with open(f'{template_path}/system_stub.yml') as file: + documents = yaml.full_load(file) + return documents + +def load_client_template(type='default', template_path = global_template_path): + with open(f'{template_path}/client_stub_{type}.yml') as file: + documents = yaml.full_load(file) + return documents + +def get_deploy_path(name: str): + return f'{Path(global_template_path).parent}/{name}' + + +def generate_client(id, template: dict, world_size: int, type='default', cpu_set=''): + local_template = copy.deepcopy(template) + key_name = list(local_template.keys())[0] + container_name = f'client_{type}_{id}' + local_template[container_name] = local_template.pop(key_name) + for key, item in enumerate(local_template[container_name]['environment']): + if item == 'RANK={rank}': + local_template[container_name]['environment'][key] = item.format(rank=id) + if item == 'WORLD_SIZE={world_size}': + local_template[container_name]['environment'][key] = item.format(world_size=world_size) + # for key, item in enumerate(local_template[container_name]): + # if item == 'cpuset: {cpu_set}': + # local_template[container_name][key] = item.format(cpu_set=cpu_set) + + local_template[container_name]['ports'] = [f'{5000+id}:5000'] + local_template[container_name]['cpuset'] = f'{cpu_set}' + return local_template, container_name + +def generate_compose_file(): + print() + + +def generate_p30_freezing_effect_dev(): + template_path = get_deploy_path('p28_non_iid_effect') + num_clients = 6 + cpu_per_client = 1 + num_cpus = 20 + world_size = num_clients + 1 + system_template: dict = load_system_template(template_path=template_path) + + for key, item in enumerate(system_template['services']['fl_server']['environment']): + if item == 'WORLD_SIZE={world_size}': + system_template['services']['fl_server']['environment'][key] = item.format(world_size=world_size) + cpu_set = 0 + cpu_idx = 2 + for client_id in range(1, num_clients + 1): + client_type = 'default' + cpu_set = f'{cpu_idx}' + cpu_idx += 1 + + client_template: dict = load_client_template(type=client_type, template_path=template_path) + client_definition, container_name = generate_client(client_id, client_template, world_size, type=client_type, cpu_set=cpu_set) + system_template['services'].update(client_definition) + + with open(r'./docker-compose.yml', 'w') as file: + 
yaml.dump(system_template, file, sort_keys=False) + +def generate_p28_non_iid_effect(): + template_path = get_deploy_path('p28_non_iid_effect') + num_clients = 10 + cpu_per_client = 1 + num_cpus = 20 + world_size = num_clients + 1 + system_template: dict = load_system_template(template_path=template_path) + + for key, item in enumerate(system_template['services']['fl_server']['environment']): + if item == 'WORLD_SIZE={world_size}': + system_template['services']['fl_server']['environment'][key] = item.format(world_size=world_size) + cpu_set = 0 + cpu_idx = 2 + for client_id in range(1, num_clients + 1): + client_type = 'default' + cpu_set = f'{cpu_idx}' + cpu_idx += 1 + + client_template: dict = load_client_template(type=client_type, template_path=template_path) + client_definition, container_name = generate_client(client_id, client_template, world_size, type=client_type, cpu_set=cpu_set) + system_template['services'].update(client_definition) + + with open(r'./docker-compose.yml', 'w') as file: + yaml.dump(system_template, file, sort_keys=False) + +def generate_p23_freezoff_w9s3(): + template_path = get_deploy_path('p23_freezoff_w9s3') + num_clients = 9 + cpu_per_client = 1 + num_cpus = 20 + world_size = num_clients + 1 + system_template: dict = load_system_template(template_path=template_path) + + for key, item in enumerate(system_template['services']['fl_server']['environment']): + if item == 'WORLD_SIZE={world_size}': + system_template['services']['fl_server']['environment'][key] = item.format(world_size=world_size) + cpu_set = 0 + cpu_idx = 2 + for client_id in range(1, num_clients + 1): + client_type = 'default' + if 0 < client_id <= 3: + client_type = 'slow' + cpu_set = f'{cpu_idx}' + cpu_idx += 1 + elif 3 < client_id <= 6: + client_type = 'medium' + cpu_set = f'{cpu_idx}-{cpu_idx+1}' + cpu_idx += 2 + elif 6 < client_id <= 9: + client_type = 'fast' + cpu_set = f'{cpu_idx}-{cpu_idx + 2}' + cpu_idx += 3 + else: + cpu_set = f'{cpu_idx}' + cpu_idx += 1 + + client_template: dict = load_client_template(type=client_type, template_path=template_path) + client_definition, container_name = generate_client(client_id, client_template, world_size, type=client_type, cpu_set=cpu_set) + system_template['services'].update(client_definition) + + with open(r'./docker-compose.yml', 'w') as file: + yaml.dump(system_template, file, sort_keys=False) + +def generate_p23_freezoff_w9s3_half(): + template_path = get_deploy_path('p23_freezoff_w9s3-half') + num_clients = 9 + cpu_per_client = 1 + num_cpus = 20 + world_size = num_clients + 1 + system_template: dict = load_system_template(template_path=template_path) + + for key, item in enumerate(system_template['services']['fl_server']['environment']): + if item == 'WORLD_SIZE={world_size}': + system_template['services']['fl_server']['environment'][key] = item.format(world_size=world_size) + cpu_set = 0 + cpu_idx = 2 + for client_id in range(1, num_clients + 1): + client_type = 'default' + if 0 < client_id <= 3: + client_type = 'slow' + cpu_set = f'{cpu_idx}' + cpu_idx += 1 + elif 3 < client_id <= 6: + client_type = 'medium' + cpu_set = f'{cpu_idx}-{cpu_idx+1}' + cpu_idx += 2 + elif 6 < client_id <= 9: + client_type = 'fast' + cpu_set = f'{cpu_idx}-{cpu_idx + 2}' + cpu_idx += 3 + else: + cpu_set = f'{cpu_idx}' + cpu_idx += 1 + + client_template: dict = load_client_template(type=client_type, template_path=template_path) + client_definition, container_name = generate_client(client_id, client_template, world_size, type=client_type, cpu_set=cpu_set) + 
system_template['services'].update(client_definition) + + with open(r'./docker-compose.yml', 'w') as file: + yaml.dump(system_template, file, sort_keys=False) + +def generate_p23_freezoff_w9s3_fast(): + template_path = get_deploy_path('p23_freezoff_w9s3_fast') + num_clients = 9 + cpu_per_client = 1 + num_cpus = 20 + world_size = num_clients + 1 + system_template: dict = load_system_template(template_path=template_path) + + for key, item in enumerate(system_template['services']['fl_server']['environment']): + if item == 'WORLD_SIZE={world_size}': + system_template['services']['fl_server']['environment'][key] = item.format(world_size=world_size) + cpu_set = 0 + cpu_idx = 2 + for client_id in range(1, num_clients + 1): + client_type = 'default' + if 0 < client_id <= 3: + client_type = 'slow' + cpu_set = f'{cpu_idx}' + cpu_idx += 1 + elif 3 < client_id <= 6: + client_type = 'medium' + cpu_set = f'{cpu_idx}-{cpu_idx+1}' + cpu_idx += 2 + elif 6 < client_id <= 9: + client_type = 'fast' + cpu_set = f'{cpu_idx}-{cpu_idx + 2}' + cpu_idx += 3 + else: + cpu_set = f'{cpu_idx}' + cpu_idx += 1 + + client_template: dict = load_client_template(type=client_type, template_path=template_path) + client_definition, container_name = generate_client(client_id, client_template, world_size, type=client_type, cpu_set=cpu_set) + system_template['services'].update(client_definition) + + with open(r'./docker-compose.yml', 'w') as file: + yaml.dump(system_template, file, sort_keys=False) + + + +def generate_terminate(num_clients = 16, medium=False): + template_path = get_deploy_path('terminate') + world_size = num_clients + 1 + system_template: dict = load_system_template(template_path=template_path) + + for key, item in enumerate(system_template['services']['fl_server']['environment']): + if item == 'WORLD_SIZE={world_size}': + system_template['services']['fl_server']['environment'][key] = item.format(world_size=world_size) + cpu_idx = 2 + for client_id in range(1, num_clients + 1): + if client_id < 5: + client_type = 'slow' + cpu_set = f'{cpu_idx}' + cpu_idx += 1 + else: + client_type = 'medium' + cpu_set = f'{cpu_idx}' + cpu_idx += 1 + client_template: dict = load_client_template(type=client_type, template_path=template_path) + client_definition, container_name = generate_client(client_id, client_template, world_size, type=client_type, + cpu_set=cpu_set) + system_template['services'].update(client_definition) + + with open(r'./docker-compose.yml', 'w') as file: + yaml.dump(system_template, file, sort_keys=False) + +def generate_dev(num_clients = 2, medium=False): + template_path = get_deploy_path('dev') + world_size = num_clients + 1 + system_template: dict = load_system_template(template_path=template_path) + + for key, item in enumerate(system_template['services']['fl_server']['environment']): + if item == 'WORLD_SIZE={world_size}': + system_template['services']['fl_server']['environment'][key] = item.format(world_size=world_size) + cpu_idx = 2 + for client_id in range(1, num_clients + 1): + if not medium: + client_type = 'fast' + cpu_set = f'{cpu_idx}-{cpu_idx + 2}' + cpu_idx += 3 + else: + client_type = 'medium' + cpu_set = f'{cpu_idx}' + cpu_idx += 1 + client_template: dict = load_client_template(type=client_type, template_path=template_path) + client_definition, container_name = generate_client(client_id, client_template, world_size, type=client_type, + cpu_set=cpu_set) + system_template['services'].update(client_definition) + + with open(r'./docker-compose.yml', 'w') as file: + yaml.dump(system_template, file, 
sort_keys=False) + +def generate_p13_w6(): + template_path = get_deploy_path('p11_freezoff') + num_clients= 6 + world_size = num_clients + 1 + system_template: dict = load_system_template(template_path=template_path) + + for key, item in enumerate(system_template['services']['fl_server']['environment']): + if item == 'WORLD_SIZE={world_size}': + system_template['services']['fl_server']['environment'][key] = item.format(world_size=world_size) + cpu_idx = 2 + for client_id in range(1, num_clients + 1): + client_type = 'default' + if 0 < client_id <= 2: + client_type = 'slow' + cpu_set = f'{cpu_idx}' + cpu_idx += 1 + elif 2 < client_id <= 4: + client_type = 'medium' + cpu_set = f'{cpu_idx}' + cpu_idx += 1 + elif 4 < client_id <= 6: + client_type = 'fast' + cpu_set = f'{cpu_idx}' + cpu_idx += 1 + else: + cpu_set = f'{cpu_idx}' + cpu_idx += 1 + + client_template: dict = load_client_template(type=client_type, template_path=template_path) + client_definition, container_name = generate_client(client_id, client_template, world_size, type=client_type, + cpu_set=cpu_set) + system_template['services'].update(client_definition) + + with open(r'./docker-compose.yml', 'w') as file: + yaml.dump(system_template, file, sort_keys=False) + + +def generate_check_w4(): + template_path = get_deploy_path('p11_freezoff') + num_clients= 4 + world_size = num_clients + 1 + system_template: dict = load_system_template(template_path=template_path) + + for key, item in enumerate(system_template['services']['fl_server']['environment']): + if item == 'WORLD_SIZE={world_size}': + system_template['services']['fl_server']['environment'][key] = item.format(world_size=world_size) + cpu_idx = 2 + for client_id in range(1, num_clients + 1): + client_type = 'fast' + cpu_set = f'{cpu_idx}' + cpu_idx += 1 + + client_template: dict = load_client_template(type=client_type, template_path=template_path) + client_definition, container_name = generate_client(client_id, client_template, world_size, type=client_type, + cpu_set=cpu_set) + system_template['services'].update(client_definition) + + with open(r'./docker-compose.yml', 'w') as file: + yaml.dump(system_template, file, sort_keys=False) + +def generate_check_w18(): + template_path = get_deploy_path('p11_freezoff') + num_clients= 18 + world_size = num_clients + 1 + system_template: dict = load_system_template(template_path=template_path) + + for key, item in enumerate(system_template['services']['fl_server']['environment']): + if item == 'WORLD_SIZE={world_size}': + system_template['services']['fl_server']['environment'][key] = item.format(world_size=world_size) + cpu_idx = 2 + for client_id in range(1, num_clients + 1): + client_type = 'fast' + cpu_set = f'{cpu_idx}' + cpu_idx += 1 + + client_template: dict = load_client_template(type=client_type, template_path=template_path) + client_definition, container_name = generate_client(client_id, client_template, world_size, type=client_type, + cpu_set=cpu_set) + system_template['services'].update(client_definition) + + with open(r'./docker-compose.yml', 'w') as file: + yaml.dump(system_template, file, sort_keys=False) + + +def generate_check_w18_fast(): + template_path = get_deploy_path('p11_freezoff_fast') + num_clients= 6 + world_size = num_clients + 1 + system_template: dict = load_system_template(template_path=template_path) + + for key, item in enumerate(system_template['services']['fl_server']['environment']): + if item == 'WORLD_SIZE={world_size}': + system_template['services']['fl_server']['environment'][key] = 
item.format(world_size=world_size) + cpu_idx = 2 + for client_id in range(1, num_clients + 1): + client_type = 'fast' + cpu_set = f'{cpu_idx}-{cpu_idx + 2}' + cpu_idx += 3 + + client_template: dict = load_client_template(type=client_type, template_path=template_path) + client_definition, container_name = generate_client(client_id, client_template, world_size, type=client_type, + cpu_set=cpu_set) + system_template['services'].update(client_definition) + + with open(r'./docker-compose.yml', 'w') as file: + yaml.dump(system_template, file, sort_keys=False) + + +def generate_p11_freezoff(): + template_path = get_deploy_path('p11_freezoff') + num_clients= 18 + world_size = num_clients + 1 + system_template: dict = load_system_template(template_path=template_path) + + for key, item in enumerate(system_template['services']['fl_server']['environment']): + if item == 'WORLD_SIZE={world_size}': + system_template['services']['fl_server']['environment'][key] = item.format(world_size=world_size) + cpu_idx = 2 + for client_id in range(1, num_clients + 1): + client_type = 'default' + if 0 < client_id <= 6: + client_type = 'slow' + cpu_set = f'{cpu_idx}' + cpu_idx += 1 + elif 6 < client_id <= 12: + client_type = 'medium' + cpu_set = f'{cpu_idx}' + cpu_idx += 1 + elif 12 < client_id <= 18: + client_type = 'fast' + cpu_set = f'{cpu_idx}' + cpu_idx += 1 + else: + cpu_set = f'{cpu_idx}' + cpu_idx += 1 + + client_template: dict = load_client_template(type=client_type, template_path=template_path) + client_definition, container_name = generate_client(client_id, client_template, world_size, type=client_type, + cpu_set=cpu_set) + system_template['services'].update(client_definition) + + with open(r'./docker-compose.yml', 'w') as file: + yaml.dump(system_template, file, sort_keys=False) + + +def generate_tifl_15(): + template_path = get_deploy_path('tifl-15') + num_clients= 18 + world_size = num_clients + 1 + system_template: dict = load_system_template(template_path=template_path) + + for key, item in enumerate(system_template['services']['fl_server']['environment']): + if item == 'WORLD_SIZE={world_size}': + system_template['services']['fl_server']['environment'][key] = item.format(world_size=world_size) + cpu_idx = 2 + for client_id in range(1, num_clients + 1): + client_type = 'default' + if 0 < client_id <= 6: + client_type = 'slow' + cpu_set = f'{cpu_idx}' + cpu_idx += 1 + elif 6 < client_id <= 12: + client_type = 'medium' + cpu_set = f'{cpu_idx}' + cpu_idx += 1 + elif 12 < client_id <= 18: + client_type = 'fast' + cpu_set = f'{cpu_idx}' + cpu_idx += 1 + else: + cpu_set = f'{cpu_idx}' + cpu_idx += 1 + + client_template: dict = load_client_template(type=client_type, template_path=template_path) + client_definition, container_name = generate_client(client_id, client_template, world_size, type=client_type, + cpu_set=cpu_set) + system_template['services'].update(client_definition) + + with open(r'./docker-compose.yml', 'w') as file: + yaml.dump(system_template, file, sort_keys=False) + + +def generate_tifl_3(): + template_path = get_deploy_path('tifl-15') + num_clients= 3 + world_size = num_clients + 1 + system_template: dict = load_system_template(template_path=template_path) + + for key, item in enumerate(system_template['services']['fl_server']['environment']): + if item == 'WORLD_SIZE={world_size}': + system_template['services']['fl_server']['environment'][key] = item.format(world_size=world_size) + cpu_idx = 3 + for client_id in range(1, num_clients + 1): + client_type = 'default' + if 0 < client_id <= 1: + 
client_type = 'slow' + cpu_set = f'{cpu_idx}' + cpu_idx += 1 + elif 1 < client_id <= 2: + client_type = 'medium' + cpu_set = f'{cpu_idx}' + cpu_idx += 1 + elif 2 < client_id <= 3: + client_type = 'fast' + cpu_set = f'{cpu_idx}' + cpu_idx += 1 + else: + cpu_set = f'{cpu_idx}' + cpu_idx += 1 + + client_template: dict = load_client_template(type=client_type, template_path=template_path) + client_definition, container_name = generate_client(client_id, client_template, world_size, type=client_type, + cpu_set=cpu_set) + system_template['services'].update(client_definition) + + with open(r'./docker-compose.yml', 'w') as file: + yaml.dump(system_template, file, sort_keys=False) + +def generate_offload_exp(): + num_clients = 4 + cpu_per_client = 1 + num_cpus = 20 + world_size = num_clients + 1 + system_template: dict = load_system_template() + + for key, item in enumerate(system_template['services']['fl_server']['environment']): + if item == 'WORLD_SIZE={world_size}': + system_template['services']['fl_server']['environment'][key] = item.format(world_size=world_size) + cpu_set = 0 + cpu_idx = 3 + for client_id in range(1, num_clients + 1): + client_type = 'medium' + client_type = 'default' + if client_id == 1 or client_id == 2: + client_type = 'medium' + cpu_set = f'{cpu_idx}-{cpu_idx+1}' + cpu_idx += 2 + elif client_id == 3: + client_type = 'slow' + cpu_set = f'{cpu_idx}' + cpu_idx += 1 + elif client_id == 4: + client_type = 'fast' + cpu_set = f'{cpu_idx}-{cpu_idx + 2}' + cpu_idx += 3 + else: + cpu_set = f'{cpu_idx}' + cpu_idx += 1 + + client_template: dict = load_client_template(type=client_type) + client_definition, container_name = generate_client(client_id, client_template, world_size, type=client_type, cpu_set=cpu_set) + system_template['services'].update(client_definition) + + with open(r'./docker-compose.yml', 'w') as file: + yaml.dump(system_template, file, sort_keys=False) + +def generate(num_clients: int): + world_size = num_clients + 1 + system_template :dict = load_system_template() + + for key, item in enumerate(system_template['services']['fl_server']['environment']): + if item == 'WORLD_SIZE={world_size}': + system_template['services']['fl_server']['environment'][key] = item.format(world_size=world_size) + + for client_id in range(1, num_clients+1): + client_type = 'default' + if client_id == 1: + client_type='slow' + if client_id == 2: + client_type='medium' + client_template: dict = load_client_template(type=client_type) + client_definition, container_name = generate_client(client_id, client_template, world_size, type=client_type) + system_template['services'].update(client_definition) + + with open(r'./docker-compose.yml', 'w') as file: + yaml.dump(system_template, file, sort_keys=False) + +def run(name, num_clients = None, medium=False): + exp_dict = { + 'tifl-15': generate_tifl_15, + 'dev': generate_dev, + 'terminate': generate_terminate, + 'p11_freezoff': generate_p11_freezoff, + 'p13_w6' : generate_p13_w6, + 'p23_w9s3': generate_p23_freezoff_w9s3, + 'p23_w9s3-half': generate_p23_freezoff_w9s3_half, + 'p23_w9s3_fast': generate_p23_freezoff_w9s3_fast, + 'p28_non_iid_effect': generate_p28_non_iid_effect, + 'p30_dev': generate_p30_freezing_effect_dev, + 'generate_check_w4': generate_check_w4, + 'generate_check_w18': generate_check_w18, + 'generate_check_w18_fast': generate_check_w18_fast + } + if num_clients: + exp_dict[name](num_clients, medium) + else: + exp_dict[name]() + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Generate docker-compose file') 
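+    # expects a positional experiment name (a key of exp_dict in run() above) and an optional --clients count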
+ parser.add_argument('name', type=str, + help='Name of an experiment') + parser.add_argument('--clients', type=int, help='Set the number of clients in the system', default=None) + args = parser.parse_args() + run(args.name, args.clients) + print('Done') + diff --git a/fltk/util/generate_docker_compose_2.py b/fltk/util/generate_docker_compose_2.py new file mode 100644 index 00000000..28c9025d --- /dev/null +++ b/fltk/util/generate_docker_compose_2.py @@ -0,0 +1,136 @@ +import argparse +import copy +from pathlib import Path +import yaml +import numpy as np + + +def load_yaml_file(file_path: Path): + with open(file_path) as file: + return yaml.full_load(file) + + +def generate_client(id, template: dict, world_size: int, type='default', cpu_set=None, num_cpus=1): + local_template = copy.deepcopy(template) + key_name = list(local_template.keys())[0] + container_name = f'client_{type}_{id}' + local_template[container_name] = local_template.pop(key_name) + for key, item in enumerate(local_template[container_name]['environment']): + if item == 'RANK={rank}': + local_template[container_name]['environment'][key] = item.format(rank=id) + if item == 'WORLD_SIZE={world_size}': + local_template[container_name]['environment'][key] = item.format(world_size=world_size) + local_template[container_name]['ports'] = [f'{5000+id}:5000'] + if cpu_set: + local_template[container_name]['cpuset'] = f'{cpu_set}' + else: + local_template[container_name].pop('cpuset') + local_template[container_name]['deploy']['resources']['limits']['cpus'] = f'{num_cpus}' + return local_template, container_name + + +def gen_client(name: str, client_dict: dict, base_path: Path): + """ + rank (id) + num_cpu + cpu_set + name + """ + client_descr_template = { + 'rank': 0, + 'num_cpu': 1, + 'num_cores': None, + 'name': name, + 'stub-file': 'stub.yml' + } + print(Path.cwd()) + mu = client_dict['cpu-speed'] + sigma = client_dict['cpu-variation'] + n = client_dict['amount'] + np.random.seed(0) + stub_file = base_path / client_dict['stub-name'] + stub_data = load_yaml_file(stub_file) + if client_dict['pin-cores'] is True: + client_descr_template['num_cores'] = client_dict['num-cores'] + client_descr_template['stub-file'] = client_dict['stub-name'] + client_cpu_speeds = np.abs(np.round(np.random.normal(mu, sigma, size=n), 2)) + client_descriptions = [] + for cpu_speed in client_cpu_speeds: + client_descr = copy.deepcopy(client_descr_template) + client_descr['num_cpu'] = cpu_speed + client_descriptions.append(client_descr) + return client_descriptions + + +def generate_clients_proporties(clients_dict: dict, path: Path): + results = [] + for k,v in clients_dict.items(): + results += gen_client(k, v, path) + return results + +def generate_compose_file_from_dict(system: dict): + path = Path(system['base_path']) + client_descriptions = generate_clients_proporties(system['clients'], path) + last_core_id = 0 + world_size = len(client_descriptions) + 1 + system_template_path = path / 'system_stub.yml' + + system_template: dict = load_yaml_file(system_template_path) + + for key, item in enumerate(system_template['services']['fl_server']['environment']): + if item == 'WORLD_SIZE={world_size}': + system_template['services']['fl_server']['environment'][key] = item.format(world_size=world_size) + if system['federator']['pin-cores']: + cpu_set: str + amount = system['federator']['num-cores'] + if amount > 1: + cpu_set = f'{last_core_id}-{last_core_id + amount - 1}' + else: + cpu_set = f'{last_core_id}' + system_template['services']['fl_server']['cpuset'] = 
cpu_set + last_core_id += amount + else: + system_template['services']['fl_server'].pop('cpuset') + for idx, client_d in enumerate(client_descriptions): + stub_file = path / client_d['stub-file'] + stub_data = load_yaml_file(stub_file) + cpu_set = None + if client_d['num_cores']: + amount = client_d['num_cores'] + if amount > 1: + cpu_set = f'{last_core_id}-{last_core_id + amount - 1}' + else: + cpu_set = f'{last_core_id}' + last_core_id += amount + local_template, container_name = generate_client(idx + 1, stub_data, world_size, client_d['name'], cpu_set, + client_d['num_cpu']) + system_template['services'].update(local_template) + print(container_name) + with open(r'./docker-compose.yml', 'w') as file: + yaml.dump(system_template, file, sort_keys=False) + +def generate_compose_file(path: Path): + """ + Used properties: + - World size + - num clients? + - path to deploy files + - random seed? + """ + + system_path = path / 'description.yml' + system = load_yaml_file(system_path) + # path = Path('deploy/dev_generate') + generate_compose_file_from_dict(system) + + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Generate docker-compose file') + parser.add_argument('path', type=str, + help='Path to a deployment config folder') + parser.add_argument('--clients', type=int, help='Set the number of clients in the system', default=None) + args = parser.parse_args() + path = Path(args.path) + results = generate_compose_file(path) + print('done') \ No newline at end of file diff --git a/fltk/util/generate_experiments.py b/fltk/util/generate_experiments.py new file mode 100644 index 00000000..09b39f73 --- /dev/null +++ b/fltk/util/generate_experiments.py @@ -0,0 +1,171 @@ +import copy +from pathlib import Path +import os +import yaml +from fltk.util.generate_docker_compose_2 import generate_compose_file, generate_compose_file_from_dict + + +def rm_tree(pth: Path): + for child in pth.iterdir(): + if child.is_file(): + child.unlink() + # else: + # rm_tree(child) + # pth.rmdir() + + +def check_num_clients_consistency(cfg_data: dict): + if type(cfg_data) is str: + cfg_data = yaml.safe_load(copy.deepcopy(cfg_data)) + + if 'deploy' in cfg_data and 'docker' in cfg_data['deploy']: + num_docker_clients = sum([x['amount'] for x in cfg_data['deploy']['docker']['clients'].values()]) + if cfg_data['num_clients'] != num_docker_clients: + print('[Warning]\t Number of docker clients is not equal to the num_clients property!') + + +def generate(base_path: Path): + descr_path = base_path / 'descr.yaml' + + exp_cfg_list = [x for x in base_path.iterdir() if '.cfg' in x.suffixes] + descr_data = '' + with open(descr_path) as descr_f: + descr_data = descr_f.read() + exps_path = base_path / 'exps' + rm_tree(exps_path) + exps_path.mkdir(parents=True, exist_ok=True) + + check_num_clients_consistency(descr_data) + for exp_cfg in exp_cfg_list: + exp_cfg_data = '' + with open(exp_cfg) as exp_f: + exp_cfg_data = exp_f.read() + + exp_data = descr_data + exp_cfg_data + exp_data += f'\nexperiment_prefix: \'{base_path.name}_{exp_cfg.name.split(".")[0]}\'\n' + filename = '.'.join([exp_cfg.name.split('.')[0], exp_cfg.name.split('.')[2]]) + with open(exps_path / filename, mode='w') as f: + f.write(exp_data) + print('Done') + + +# def run(): +# base_path = Path(__file__).parent +# descr_path = base_path / 'descr.yaml' +# +# exp_cfg_list = [x for x in base_path.iterdir() if '.cfg' in x.suffixes] +# descr_data = '' +# with open(descr_path) as descr_f: +# descr_data = descr_f.read() +# +# exps_path = base_path / 
'exps' +# exps_path.mkdir(parents=True, exist_ok=True) +# for exp_cfg in exp_cfg_list: +# exp_cfg_data = '' +# replications = 1 +# with open(exp_cfg) as exp_f: +# exp_cfg_data = exp_f.read() +# for replication_id in range(replications): +# exp_data = descr_data + exp_cfg_data +# exp_data += f'\nexperiment_prefix: \'{Path(__file__).parent.name}_{exp_cfg.name.split(".")[0]}\'\n' +# filename = '.'.join([exp_cfg.name.split('.')[0], exp_cfg.name.split('.')[2]]) +# with open(exps_path / filename, mode='w') as f: +# f.write(exp_data) +# print('Done') + + +def run(base_path: Path): + print(f'Run {base_path}') + print(list(base_path.iterdir())) + descr_path = base_path / 'descr.yaml' + exp_cfg_list = [x for x in base_path.iterdir() if '.cfg' in x.suffixes] + descr_data = '' + with open(descr_path) as descr_f: + descr_data = yaml.safe_load(descr_f.read()) + + replications = 1 + if 'replications' in descr_data: + replications = descr_data['replications'] + run_docker = False + if 'deploy' in descr_data and 'docker' in descr_data['deploy']: + # if 'docker_system' in descr_data: + # Run in docker + # Generate Docker + print(descr_data) + docker_deploy_path = Path(descr_data['deploy']['docker']['base_path']) + + print(docker_deploy_path) + run_docker = True + generate_compose_file_from_dict(descr_data['deploy']['docker']) + # generate_compose_file(docker_deploy_path) + + exp_files = [x for x in (base_path / 'exps').iterdir() if x.suffix in ['.yaml', '.yml']] + + cmd_list = [] + print(exp_files) + if run_docker: + first_prefix = '--build' + for exp_cfg_file in exp_files: + for replication_id in range(replications): + cmd = f'export OPTIONAL_PARAMS="--prefix={replication_id}";export EXP_CONFIG_FILE="{exp_cfg_file}"; docker-compose --compatibility up {first_prefix};' + cmd_list.append(cmd) + # print(f'Running cmd: "{cmd}"') + # os.system(cmd) + first_prefix = '' + pass + else: + print('Switching to direct mode') + for exp_cfg_file in exp_files: + for replication_id in range(replications): + # cmd = f'export OPTIONAL_PARAMS="--prefix={replication_id}";export EXP_CONFIG_FILE="{exp_cfg_file}"; docker-compose --compatibility up {first_prefix};' + cmd = f'python3 -m fltk single {exp_cfg_file} --prefix={replication_id}' + cmd_list.append(cmd) + pass + + [print(x) for x in cmd_list] + for cmd in cmd_list: + print(f'Running cmd: "{cmd}"') + os.system(cmd) + print('Done') + # docker_system + + + # name = 'dev' + # generate_docker(name) + # base_path = f'{Path(__file__).parent}' + # exp_list = [ + # 'fedavg.yaml', + # ] + # exp_list = [f'{base_path}/exps/{x}' for x in exp_list] + # first_prefix = '--build' + # for exp_cfg_file in exp_list: + # cmd = f'export EXP_CONFIG_FILE="{exp_cfg_file}"; docker-compose --compatibility up {first_prefix};' + # print(f'Running cmd: "{cmd}"') + # os.system(cmd) + # first_prefix = '' + + # print('Done') + +# if __name__ == '__main__': +# base_path = Path(__file__).parent +# descr_path = base_path / 'descr.yaml' +# +# exp_cfg_list = [x for x in base_path.iterdir() if '.cfg' in x.suffixes] +# descr_data = '' +# with open(descr_path) as descr_f: +# descr_data = descr_f.read() +# exps_path = base_path / 'exps' +# exps_path.mkdir(parents=True, exist_ok=True) +# for exp_cfg in exp_cfg_list: +# exp_cfg_data = '' +# with open(exp_cfg) as exp_f: +# exp_cfg_data = exp_f.read() +# +# exp_data = descr_data + exp_cfg_data +# exp_data += f'\nexperiment_prefix: \'{Path(__file__).parent.name}_{exp_cfg.name.split(".")[0]}\'\n' +# filename = '.'.join([exp_cfg.name.split('.')[0], 
exp_cfg.name.split('.')[2]]) +# with open(exps_path / filename, mode='w') as f: +# f.write(exp_data) +# print('Done') +# +# diff --git a/fltk/util/log.py b/fltk/util/log.py new file mode 100644 index 00000000..ba7c3c16 --- /dev/null +++ b/fltk/util/log.py @@ -0,0 +1,20 @@ +import logging + +from torch.distributed import rpc + +from fltk.util.definitions import LogLevel + + +class FLLogger: + @staticmethod + @rpc.functions.async_execution + def log(arg1, node_id, log_line, report_time): + logging.info(f'[{node_id}: {report_time}]: {log_line}') + + +def getLogger(module_name, level: LogLevel = LogLevel.INFO): + logging.basicConfig( + level=level.value, + format='%(asctime)s %(levelname)s %(module)s - %(funcName)s: %(message)s', + ) + return logging.getLogger(module_name) diff --git a/fltk/util/offloading_estimate.py b/fltk/util/offloading_estimate.py new file mode 100644 index 00000000..cfbf25de --- /dev/null +++ b/fltk/util/offloading_estimate.py @@ -0,0 +1,61 @@ + +def calc_optimal_offloading_point(profiler_data, time_till_deadline, iterations_left): + ff, cf, cb, fb = profiler_data + full_network = ff + cf + cb + fb + frozen_network = ff + cf + cb + split_point = 0 + for z in range(iterations_left, -1, -1): + x = z + y = iterations_left - x + # print(z) + new_est_split = (x * full_network) + (y * frozen_network) + split_point = x + if new_est_split < time_till_deadline: + break + + +def estimate(): + """ + freeze_network = ff + cf + cb + fb + frozen_network = ff + cf + cb + + td = time until deadline + cl = cycles left + + a = 1 + b = cl - a + + + """ + np = { + 'a': 2, + 'b': 1, + 'c': 3, + 'd': 4, + } + + sp = { + 'time_left': 400, + 'iter_left': 44 + } + + f_n = np['a'] + np['b'] + np['c'] + np['d'] + o_n = np['a'] + np['b'] + np['c'] + est_full_comp_time = f_n * sp['iter_left'] + new_est = o_n * sp['iter_left'] + x = 20 + y = sp['iter_left'] - x + new_est_split = (x * f_n) + (y * o_n) + + print(f'estimate: {est_full_comp_time} < {sp["time_left"]} ? 
{est_full_comp_time = first_cls_layer + df.loc[mask, 'type'] = 'classifier' + mask = df['layer_id'] < first_cls_layer + df.loc[mask, 'type'] = 'feature' + + tmp = df.groupby(['execution_id', 'event', 'id_type_combined', 'layer_id', 'type']).time.mean().reset_index() + sorted = tmp.sort_values(['event', 'execution_id'], ascending=[False, True]) + + grouped_df = tmp.groupby(['event', 'type']).sum().reset_index()[['event', 'type', 'time']] + grouped_df['model'] = name + # for idx, row in df.iterrows(): + # print(idx, row) + return grouped_df + + +def parse_stability_data(data: List[pd.DataFrame], save_to_file: bool = False, filename: str = 'stability_data.csv'): + df_list = [] + for idx, df in enumerate(data): + df['idx'] = idx + df_list.append(df) + + combined_df = pd.concat(df_list, ignore_index=True) + if save_to_file: + combined_df.to_csv(filename) + return combined_df + + +def stability_plot(df: pd.DataFrame): + # for idx, df in enumerate(data): + # print(idx) + pass + +def calc_metric(df, start_cls_layer): + df['type'] = 'feature' + mask = df['layer_id'] >= start_cls_layer + df.loc[mask, 'type'] = 'classifier' + mask = df['layer_id'] < start_cls_layer + df.loc[mask, 'type'] = 'feature' + combined = df.groupby(['event', 'type', 'idx']).sum().reset_index() + + features_f: pd.DataFrame = combined[(combined['type'] == 'feature') & (combined['event'] == 'forward')][['time', 'idx']] + classifier_f = combined[(combined['type'] == 'classifier') & (combined['event'] == 'forward')][['time', 'idx']] + features_b = combined[(combined['type'] == 'feature') & (combined['event'] == 'backward')][['time', 'idx']] + classifier_b = combined[(combined['type'] == 'classifier') & (combined['event'] == 'backward')][['time', 'idx']] + + features_f2: pd.DataFrame = combined[(combined['type'] == 'feature') & (combined['event'] == 'forward')] + classifier_f2 = combined[(combined['type'] == 'classifier') & (combined['event'] == 'forward')] + features_b2 = combined[(combined['type'] == 'feature') & (combined['event'] == 'backward')] + classifier_b2 = combined[(combined['type'] == 'classifier') & (combined['event'] == 'backward')] + + plt.figure() + # sns.lineplot(data=pd.concat([features_b2, features_f2, classifier_b2, classifier_f2], ignore_index=True), x='idx', y='time', hue='type') + sns.lineplot(data=pd.concat([features_f2, classifier_b2, classifier_f2], ignore_index=True), x='idx', y='time', hue='event') + plt.title('Weak offloaded Client') + plt.show() + + plt.figure() + # sns.lineplot(data=pd.concat([features_b2, features_f2, classifier_b2, classifier_f2], ignore_index=True), x='idx', y='time', hue='type') + sns.lineplot(data=pd.concat([features_f2, features_b2, classifier_b2, classifier_f2], ignore_index=True), x='idx', y='time', + hue='event') + plt.title('Original Weak client') + plt.show() + plt.figure() + sns.lineplot(data=pd.concat([features_f2, features_b2], ignore_index=True), x='idx', y='time', hue='event') + plt.title('Offload') + plt.show() + + plt.figure() + # sns.lineplot(data=pd.concat([features_b2, features_f2, classifier_b2, classifier_f2], ignore_index=True), x='idx', y='time', hue='type') + sns.lineplot(data=pd.concat([features_f2, classifier_b2, classifier_f2], ignore_index=True), x='idx', y='time') + plt.title('Weak offloaded Client #2') + plt.show() + + plt.figure() + # sns.lineplot(data=pd.concat([features_b2, features_f2, classifier_b2, classifier_f2], ignore_index=True), x='idx', y='time', hue='type') + sns.lineplot(data=pd.concat([features_f2, features_b2, classifier_b2, 
classifier_f2], ignore_index=True), x='idx', + y='time') + plt.title('Original Weak client #2') + plt.show() + plt.figure() + sns.lineplot(data=pd.concat([features_f2, features_b2], ignore_index=True), x='idx', y='time') + plt.title('Offload #2') + plt.show() + + + features_f.rename(columns={'time': 'time_f_f'}, inplace=True) + classifier_f.rename(columns={'time': 'time_c_f'}, inplace=True) + features_b.rename(columns={'time': 'time_f_b'}, inplace=True) + classifier_b.rename(columns={'time': 'time_c_b'}, inplace=True) + + combined_df = features_f.copy(deep=True) + combined_df = combined_df.merge(classifier_f, on='idx') + combined_df = combined_df.merge(features_b, on='idx') + combined_df = combined_df.merge(classifier_b, on='idx') + + combined_df['offload_time'] = combined_df['time_f_f'] + combined_df['time_f_b'] + combined_df['gained_time'] = combined_df['time_c_f'] + combined_df['time_f_f'] + combined_df['time_f_b'] + + data_list = [] + for _, row in combined_df.iterrows(): + data_list.append([row['offload_time'], 'offload', row['idx']]) + data_list.append([row['gained_time'], 'gained', row['idx']]) + # offload = features_f.copy(deep=True) + # frozen = features_f.copy(deep=True) + # + # offload['time'] += features_b['time'] + # frozen['time'] = classifier_f['time'].values + classifier_b['time'].values + # Compute time of part that is offloaded to strong node + + return pd.DataFrame(data_list, columns=['time', 'type', 'idx']) + +if __name__ == '__main__': + print('Hello world') + + + df = pd.read_csv('stability_data.csv') + calc = calc_metric(df, 15) + + plt.figure() + sns.lineplot(data=calc, x='idx', y='time', hue='type') + plt.show() + # first = df.head(10) + # groups = df.groupby(['idx', 'layer_id', 'event']) + # # df['layer_id'] = pd.to_ + # df['layer_id'] = df['layer_id'].astype(str) + # plt.figure() + # # sns.lineplot(data=df, x='idx', y='time', hue='layer_id') + # g = sns.FacetGrid(df, col="event", hue='layer_id') + # g.map(sns.lineplot, "idx", "time") + # plt.show() + + # for i in groups.groups: + # print(groups.groups[i]) +# +# clean_df = parse_data(df, model_name, meta_data) +# meta_data = { +# 'lenet-5': 6, +# 'alexnet': 13, +# 'vgg16': 13, +# 'cifar_10_cnn': 15 +# } \ No newline at end of file diff --git a/fltk/util/profiler.py b/fltk/util/profiler.py new file mode 100644 index 00000000..d8f6c246 --- /dev/null +++ b/fltk/util/profiler.py @@ -0,0 +1,189 @@ +from dataclasses import dataclass + +import torch +from torch.nn import Module +import time +import pandas as pd + +@dataclass +class Event: + time: int + layer_id: int + name: str + event: str + execution_id: int + + def to_list(self): + return [self.time, self.layer_id, self.name, self.event, f'{self.layer_id}-{self.name}', self.execution_id] + +class Profiler: + current_layer = 0 + event_list = [] + last_time = 0 + execution_id = 0 + last_forward_event = None + warmup = False + hook_handles = [] + + def add(self, event: Event): + if event.layer_id >= 100: + print('Error') + print(event) + for e in self.event_list[-150:]: + print(e) + assert(event.layer_id < 100) + self.event_list.append(event) + + def pre_forward(self, other, input): + if self.warmup: + return None + # print(f'Pre forward: {other.__class__.__name__}') + # self.event_list.append(Event(time.time_ns(), self.current_layer, other.__class__.__name__, "pre_forward")) + self.last_forward_event = Event(time.time_ns(), self.current_layer, other.__class__.__name__, "forward", self.execution_id) + + def forward(self, other, input, output): + if self.warmup: + return 
None + # print(f'Forward: {other.__class__.__name__}') + self.last_forward_event.time = time.time_ns() - self.last_forward_event.time + # self.event_list.append(self.last_forward_event) + self.add(self.last_forward_event) + self.current_layer += 1 + self.execution_id += 1 + + def backward(self, module, grad_input, grad_output): + # pass + if self.warmup: + return None + # print(f'Backward: {module.__class__.__name__}') + # self.event_list.append(Event(time.time_ns() - self.last_time, self.current_layer, module.__class__.__name__, "backward", self.execution_id)) + self.add(Event(time.time_ns() - self.last_time, self.current_layer, module.__class__.__name__, "backward", self.execution_id)) + self.current_layer -= 1 + self.execution_id += 1 + self.last_time = time.time_ns() + return None + + def signal_backward_start(self): + self.current_layer -= 1 + self.last_time = time.time_ns() + + def signal_forward_start(self): + self.current_layer = 0 + self.execution_id = 0 + self.last_time = None + self.last_forward_event = None + + def print_events(self): + for e in self.event_list: + print(e) + + def to_dataframe(self) -> pd.DataFrame: + data = [x.to_list() for x in self.event_list] + return pd.DataFrame(data, columns = ['time', 'layer_id', 'layer_type', 'event', 'id_type_combined', 'execution_id']) + + def export_data(self): + return self.to_dataframe().groupby(['event', 'layer_id']).mean().reset_index()[['event', 'layer_id', 'time']] + + def reset(self): + self.event_list = [] + + def calc_metric(self, start_cls_layer): + df = self.to_dataframe() + df['type'] = 'feature' + mask = df['layer_id'] >= start_cls_layer + df.loc[mask, 'type'] = 'classifier' + mask = df['layer_id'] < start_cls_layer + df.loc[mask, 'type'] = 'feature' + combined = df.groupby(['event', 'type']).sum().reset_index() + + features_f = combined[(combined['type'] == 'feature') & (combined['event'] == 'forward')]['time'].values[0] + classifier_f = combined[(combined['type'] == 'classifier') & (combined['event'] == 'forward')]['time'].values[0] + features_b = combined[(combined['type'] == 'feature') & (combined['event'] == 'backward')]['time'].values[0] + classifier_b = combined[(combined['type'] == 'classifier') & (combined['event'] == 'backward')]['time'].values[0] + return features_f, features_b, classifier_f, classifier_b + + + def set_warmup(self, value): + self.warmup = value + + def printnorm(self, other, input, output): + # input is a tuple of packed inputs + # output is a Tensor. output.data is the Tensor we are interested + print('Inside ' + other.__class__.__name__ + ' forward') + # print('') + # print('input: ', type(input)) + # print('input[0]: ', type(input[0])) + # print('output: ', type(output)) + # print('') + # print('input size:', input[0].size()) + # print('output size:', output.data.size()) + # print('output norm:', output.data.norm()) + + def remove_all_handles(self): + for handle in self.hook_handles: + handle.remove() + + def attach(self, module: Module): + + def get_children(model: torch.nn.Module): + # get children form model! + children = list(model.children()) + flatt_children = [] + if children == []: + # if model has no children; model is last child! :O + return model + else: + # look for children from children... to the last child! 
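+                # recurse: a container returns a flat list of its leaves (extend), while a leaf module returns itself and lands in the TypeError fallback (append)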
+ for child in children: + try: + flatt_children.extend(get_children(child)) + except TypeError: + flatt_children.append(get_children(child)) + return flatt_children + + kids = get_children(module) + + print(module) + for k in kids: + # print(f'Registrating hooks for layer {k}') + h1 = k.register_forward_hook(self.forward) + self.hook_handles.append(h1) + h2 = k.register_forward_pre_hook(self.pre_forward) + self.hook_handles.append(h2) + h3 = k.register_backward_hook(self.backward) + self.hook_handles.append(h3) + # module.register_forward_hook(self.printnorm) + # for name, m in module.named_children(): + # print(f'>> Name: {name}') + # print(f'>> Content: {m.parameters()}') + # for child in module.children(): + # print(f'Registrating hooks for layer {child}') + # child.register_forward_hook(self.forward) + # child.register_forward_pre_hook(self.pre_forward) + # child.register_backward_hook(self.backward) + # child.register_full_backward_hook(self.backward) + + def profile_run(self, module, input, iterations, warmup_time = 0) -> pd.DataFrame: + output = module(input) + g0 = torch.rand_like(output) + + self.attach(module) + module.train() + self.set_warmup(True) + for i in range(warmup_time): # warmup + print('warmup cycle') + self.signal_forward_start() + output = module(input) + self.signal_backward_start() + output.backward(g0) + self.set_warmup(False) + for i in range(iterations): + print(i, end='') + self.signal_forward_start() + output = module(input) + self.signal_backward_start() + output.backward(g0) + print('') + self.print_events() + + return self.to_dataframe() \ No newline at end of file diff --git a/fltk/util/profilerV2.py b/fltk/util/profilerV2.py new file mode 100644 index 00000000..9576f302 --- /dev/null +++ b/fltk/util/profilerV2.py @@ -0,0 +1,171 @@ +import torch +from torch.nn import Module +import time + +import numpy as np + + +class Profiler: + current_layer = 0 + last_time = 0 + execution_id = 0 + last_forward_time = None + warmup = False + hook_handles = [] + + feature_layers_ends: int = 0 + ff: np.ndarray + fb: np.ndarray + cf: np.ndarray + cb: np.ndarray + + batch_idx = 0 + + def __init__(self, rounds: int, feature_layers_ends: int): + self.round = rounds + self.ff = np.zeros(self.round) + self.fb = np.zeros(self.round) + self.cf = np.zeros(self.round) + self.cb = np.zeros(self.round) + self.feature_layers_ends = feature_layers_ends + + def attach(self, module: Module): + def get_children(model: torch.nn.Module): + # get children form model! + children = list(model.children()) + flatt_children = [] + if children == []: + # if model has no children; model is last child! :O + return model + else: + # look for children from children... to the last child! 
+            for child in children: +                try: +                    flatt_children.extend(get_children(child)) +                except TypeError: +                    flatt_children.append(get_children(child)) +            return flatt_children + +        kids = get_children(module) + +        print(module) +        for idx, k in enumerate(kids): +            # print(f'[{idx}] Registering hooks for layer {k}') +            h1 = k.register_forward_hook(self.forward) +            self.hook_handles.append(h1) +            h2 = k.register_forward_pre_hook(self.pre_forward) +            self.hook_handles.append(h2) +            h3 = k.register_backward_hook(self.backward) +            self.hook_handles.append(h3) + +    def remove_all_handles(self): +        for handle in self.hook_handles: +            handle.remove() + +    def set_warmup(self, value): +        self.warmup = value + +    def add(self, layer_id, duration, backprogation: bool = False): +        is_cls = layer_id > self.feature_layers_ends +        if is_cls: +            if backprogation: +                # use cb +                self.cb[self.batch_idx] += duration +            else: +                # use cf +                self.cf[self.batch_idx] += duration +        else: +            if backprogation: +                # use fb +                self.fb[self.batch_idx] += duration +            else: +                # use ff +                self.ff[self.batch_idx] += duration + + +    def pre_forward(self, other, input): +        if self.warmup: +            return None +        self.last_forward_time = time.time() + +    def forward(self, other, input, output): +        if self.warmup: +            return None +        # print(f'Forward: {other.__class__.__name__}') +        self.last_forward_time = time.time() - self.last_forward_time +        # self.event_list.append(self.last_forward_event) +        # self.add(self.last_forward_event) +        self.add(self.current_layer, self.last_forward_time, False) +        self.current_layer += 1 +        self.execution_id += 1 + +    def backward(self, module, grad_input, grad_output): +        if self.warmup: +            return None +        # print(f'Backward: {module.__class__.__name__}') +        # self.event_list.append(Event(time.time() - self.last_time, self.current_layer, module.__class__.__name__, "backward", self.execution_id)) +        self.add(self.current_layer, time.time() - self.last_time, True) +        # self.add(Event(time.time() - self.last_time, self.current_layer, module.__class__.__name__, "backward", self.execution_id)) +        self.current_layer -= 1 +        self.execution_id += 1 +        self.last_time = time.time() +        return None + +    def signal_backward_start(self): +        self.current_layer -= 1 +        self.last_time = time.time() + +    def signal_forward_start(self): +        self.current_layer = 0 +        self.execution_id = 0 +        self.last_time = None +        self.last_time = 0 + +    def step(self): +        self.batch_idx += 1 + +    def get_values(self): +        """ +        Returns the measured values in the following order: ff, cf, cb, fb +        ff = feature layers forward propagation +        cf = classifier layers forward propagation +        cb = classifier layers backwards propagation +        fb = feature layers backwards propagation +        The order is the execution order of forward and then backward propagation of a network +        """ +        return self.ff, self.cf, self.cb, self.fb + +    def aggregate_values(self, from_layer: int = 0): +        """ +        Returns the mean of the measured values in the following order: ff, cf, fb, cb +        ff = feature layers forward propagation +        cf = classifier layers forward propagation +        fb = feature layers backwards propagation +        cb = classifier layers backwards propagation +        The values are averaged over the recorded batches, starting at batch index from_layer +        """ +        return self.ff[from_layer:].mean(), self.cf[from_layer:].mean(), self.fb[from_layer:].mean(), self.cb[ +            from_layer:].mean() + +    def profile_run(self, module, input, iterations, warmup_time = 0): +        output = module(input) +        g0 = torch.rand_like(output) + +        self.attach(module) +        module.train() +        self.set_warmup(True) + 
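        # warm-up passes: the hooks still fire but skip measurement while self.warmup is True, so one-off initialisation cost is excluded from the profile +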
for i in range(warmup_time): # warmup + print('warmup cycle') + self.signal_forward_start() + output = module(input) + self.signal_backward_start() + output.backward(g0) + self.set_warmup(False) + for i in range(iterations): + print(i, end='') + self.signal_forward_start() + output = module(input) + self.signal_backward_start() + output.backward(g0) + self.step() + print('') \ No newline at end of file diff --git a/fltk/util/profilerV3.py b/fltk/util/profilerV3.py new file mode 100644 index 00000000..ab7d54c7 --- /dev/null +++ b/fltk/util/profilerV3.py @@ -0,0 +1,262 @@ +import torch +from torch.nn import Module +import time + +import numpy as np + + +class Profiler: + current_layer = 0 + last_time = 0 + execution_id = 0 + last_forward_time = None + warmup = False + hook_handles = [] + + feature_layers_ends: int = 0 + ff: np.ndarray + fb: np.ndarray + cf: np.ndarray + cb: np.ndarray + + batch_idx = 0 + + ## Total values needed: + # network_start + # pre_forward_hook(split + 1) + # full_backwards_hook(split) + # backwards_end + # forwards_ends + # Start backwards + + # Intermediate time values + forward_start_time: float + backwards_start_time: float + forward_end_time: float + backwards_end_time: float + pre_forward_post_split_time: float + backwards_split_time: float + + def __init__(self, rounds: int, feature_layers_ends: int): + self.round = rounds + self.ff = np.zeros(self.round) + self.fb = np.zeros(self.round) + self.cf = np.zeros(self.round) + self.cb = np.zeros(self.round) + self.feature_layers_ends = feature_layers_ends + + def attach(self, module: Module): + def get_children(model: torch.nn.Module): + # get children form model! + children = list(model.children()) + flatt_children = [] + if children == []: + # if model has no children; model is last child! :O + return model + else: + # look for children from children... to the last child! 
+ for child in children: + try: + flatt_children.extend(get_children(child)) + except TypeError: + flatt_children.append(get_children(child)) + return flatt_children + + kids = get_children(module) + + print(module) + + # Core idea is to find the following segments + # ff = network start <-> pre_forward_hook(split + 1) + # fb = full_backwards_hook(split) <-> backward ends + # cf = pre_forward_hook(split+ 1) <-> end forward + # cb = start backwards <-> full_backwards_hook(split) + ## Total values needed: + # network_start + # pre_forward_hook(split + 1) + # full_backwards_hook(split) + # backwards_end + # forwards_ends + # Start backwards + + for idx, k in enumerate(kids): + # print(f'[{idx}] Registering hooks for layer {k}') + + if idx == self.feature_layers_ends: + # handle = k.register_full_backward_hook(self.full_backwards) + handle = k.register_backward_hook(self.full_backwards) + self.hook_handles.append(handle) + if idx == self.feature_layers_ends + 1: + handle = k.register_forward_pre_hook(self.pre_forward) + self.hook_handles.append(handle) + # h1 = k.register_forward_hook(self.forward) + # self.hook_handles.append(h1) + # h2 = k.register_forward_pre_hook(self.pre_forward) + # self.hook_handles.append(h2) + # h3 = k.register_backward_hook(self.backward) + # module.register_forward_pre_hook(self.pre_network_forward) + # self.hook_handles.append(h3) + + def full_backwards(self, module, grad_input, grad_output): + self.backwards_split_time = time.time() + self.cb[self.batch_idx] = self.backwards_split_time - self.backwards_start_time + return None + + def pre_forward(self, other, input): + self.pre_forward_post_split_time = time.time() + self.ff[self.batch_idx] = self.pre_forward_post_split_time - self.forward_start_time + # if self.warmup: + # return None + # self.last_forward_time = time.time() + # print('Pre layer hook') + # print('Inside ' + other.__class__.__name__ + ' forward') + + def remove_all_handles(self): + for handle in self.hook_handles: + handle.remove() + + def set_warmup(self, value): + self.warmup = value + + def add(self, layer_id, duration, backprogation: bool = False): + is_cls = layer_id > self.feature_layers_ends + if is_cls: + if backprogation: + # use cb + self.cb[self.batch_idx] += duration + else: + # use cf + self.cf[self.batch_idx] += duration + else: + if backprogation: + # use fb + self.fb[self.batch_idx] += duration + else: + # use ff + self.ff[self.batch_idx] += duration + + + # def pre_forward(self, other, input): + # if self.warmup: + # return None + # self.last_forward_time = time.time() + # print('Pre layer hook') + # print('Inside ' + other.__class__.__name__ + ' forward') + # + # + # def pre_network_forward(self, other, input): + # print('Pre network hook') + # print('Inside ' + other.__class__.__name__ + ' forward') + # + # def forward(self, other, input, output): + # if self.warmup: + # return None + # # print(f'Forward: {other.__class__.__name__}') + # self.last_forward_time = time.time() - self.last_forward_time + # # self.event_list.append(self.last_forward_event) + # # self.add(self.last_forward_event) + # self.add(self.current_layer, self.last_forward_time, False) + # self.current_layer += 1 + # self.execution_id += 1 + + + # def backward(self, module, grad_input, grad_output): + # if self.warmup: + # return None + # # print(f'Backward: {module.__class__.__name__}') + # # self.event_list.append(Event(time.time() - self.last_time, self.current_layer, module.__class__.__name__, "backward", self.execution_id)) + # self.add(self.current_layer, 
time.time() - self.last_time, True) + # # self.add(Event(time.time() - self.last_time, self.current_layer, module.__class__.__name__, "backward", self.execution_id)) + # self.current_layer -= 1 + # self.execution_id += 1 + # self.last_time = time.time() + # return None + + # Core idea is to find the following segments + # ff = network start <-> pre_forward_hook(split + 1) + # fb = full_backwards_hook(split) <-> backward ends + # cf = pre_forward_hook(split+ 1) <-> end forward + # cb = start backwards <-> full_backwards_hook(split) + def signal_forward_start(self): + self.forward_start_time = time.time() + + def signal_forward_end(self): + self.forward_end_time = time.time() + self.cf[self.batch_idx] = self.forward_end_time - self.pre_forward_post_split_time + + def signal_backwards_start(self): + self.backwards_start_time = time.time() + + + def signal_backwards_end(self): + self.backwards_end_time = time.time() + self.fb[self.batch_idx] = self.backwards_end_time - self.backwards_split_time + + + # def signal_backwards_start_combined(self): + # self.backwards_start_time = time.time() + # self.forward_end_time = time.time() + + # def signal_backward_start(self): + # self.current_layer -= 1 + # self.last_time = time.time() + # + # def signal_forward_start(self): + # self.current_layer = 0 + # self.execution_id = 0 + # self.last_time = None + # self.last_time = 0 + + def step(self): + self.batch_idx += 1 + + def get_values(self): + """ + Returns the measured values in the following order: ff, cf, cb, fb + ff = feature layers forward propagation + cf = classifier layers forward propagation + cb = feature layers backwards propagation + fb = feature layers backwards propagation + The order is the execution order of forward and then backward propagation of a network + """ + return self.ff, self.cf, self.cb, self.fb + + def aggregate_values(self, from_layer: int = 0): + """ + Returns the measured values in the following order: ff, cf, cb, fb + ff = feature layers forward propagation + cf = classifier layers forward propagation + cb = feature layers backwards propagation + fb = feature layers backwards propagation + The order is the execution order of forward and then backward propagation of a network + """ + return self.ff[from_layer:].mean(), self.cf[from_layer:].mean(), self.fb[from_layer:].mean(), self.cb[ + from_layer:].mean() + + def profile_run(self, module, input, iterations, warmup_time = 0): + output = module(input) + g0 = torch.rand_like(output) + + self.attach(module) + module.train() + self.set_warmup(True) + for i in range(warmup_time): # warmup + print('warmup cycle') + self.signal_forward_start() + output = module(input) + self.signal_forward_end() + self.signal_backwards_start() + output.backward(g0) + self.signal_backwards_end() + self.set_warmup(False) + for i in range(iterations): + print(i, end='') + self.signal_forward_start() + output = module(input) + self.signal_forward_end() + self.signal_backwards_start() + output.backward(g0) + self.signal_backwards_end() + self.step() + print('') + print(self.get_values()) \ No newline at end of file diff --git a/fltk/util/remote.py b/fltk/util/remote.py new file mode 100644 index 00000000..0202f92f --- /dev/null +++ b/fltk/util/remote.py @@ -0,0 +1,66 @@ +import time +from typing import Any, List + +from torch.distributed import rpc +from dataclasses import dataclass, field +from torch.futures import Future + +def _call_method(method, rref, *args, **kwargs): + return method(rref.local_value(), *args, **kwargs) + +def 
_remote_method(method, rref, *args, **kwargs): + args = [method, rref] + list(args) + return rpc.rpc_sync(rref.owner(), _call_method, args=args, kwargs=kwargs) + +def _remote_method_async(method, rref, *args, **kwargs): + args = [method, rref] + list(args) + return rpc.rpc_async(rref.owner(), _call_method, args=args, kwargs=kwargs) + +@dataclass +class TimingRecord: + client_id: str + metric: str + value: Any + epoch: int = None + timestamp: float = field(default_factory=time.time) + + +class ClientRef: + ref = None + name = "" + data_size = 0 + tb_writer = None + timing_data: List[TimingRecord] = [] + + def __init__(self, name, ref, tensorboard_writer): + self.name = name + self.ref = ref + self.tb_writer = tensorboard_writer + self.timing_data = [] + + def __repr__(self): + return self.name + +@dataclass +class AsyncCall: + future: Future + client: ClientRef + start_time: float = 0 + end_time: float = 0 + + def duration(self): + return self.end_time - self.start_time + + +def bind_timing_cb(response_obj: AsyncCall): + def callback(fut): + stop_time = time.time() + response_obj.end_time = stop_time + response_obj.future.then(callback) + +def timed_remote_async_call(client, method, rref, *args, **kwargs): + start_time = time.time() + fut = _remote_method_async(method, rref, *args, **kwargs) + response = AsyncCall(fut, client, start_time=start_time) + bind_timing_cb(response) + return response \ No newline at end of file diff --git a/fltk/util/results.py b/fltk/util/results.py index 9c3333f2..34bde2de 100644 --- a/fltk/util/results.py +++ b/fltk/util/results.py @@ -1,17 +1,28 @@ from dataclasses import dataclass - +from typing import Any import numpy as np @dataclass class EpochData: epoch_id: int - duration_train: int - duration_test: int + num_epochs: int + duration_train: float + duration_test: float loss_train: float accuracy: float loss: float class_precision: np.array class_recall: np.array confusion_mat: np.array + training_process: int client_id: str = None + client_wall_time: float = 0 + global_wall_time: float = 0 + global_epoch_id: int = 0 + + def to_csv_line(self): + delimeter = ',' + values = self.__dict__.values() + values = [str(x) for x in values] + return delimeter.join(values) diff --git a/fltk/util/show_client_distributions.py b/fltk/util/show_client_distributions.py new file mode 100644 index 00000000..c30cfbbb --- /dev/null +++ b/fltk/util/show_client_distributions.py @@ -0,0 +1,263 @@ +import pandas as pd +from tqdm import tqdm + +from fltk.core.distributed.client import Client +from fltk.datasets.distributed import DistCIFAR10Dataset, DistCIFAR100Dataset, DistFashionMNISTDataset, DistDataset +import logging + +from fltk.util.base_config import BareConfig + +logging.basicConfig( + level=logging.DEBUG, + format='%(asctime)s %(levelname)s %(module)s - %(funcName)s: %(message)s', +) +# dist_settings = { +# 'uniform':{}, +# 'limit labels': {'seed': 1, 'range':[0.1, 1, 0.1]}, +# 'q sampler': {'seed': 1, 'range':[0.1, 1, 0.1]}, +# 'dirichlet': {'seed': 1, 'range':[0.1, 1, 0.1]}, +# } + +dist_settings = { + # 'uniform':{}, + # 'limit labels flex': {'seed': 1, 'range':[0.1, 1, 0.1]}, + 'n labels': {'seed': 1, 'range':[0.1, 1, 0.1]}, + # 'q sampler': {'seed': 1, 'range':[0.1, 1, 0.1]}, + # 'dirichlet': {'seed': 1, 'range':[0.1, 1, 0.1]}, +} + +num_clients = 10 +class dummy_args: + net = 'Cifar10CNN' + dataset_name = 'cifar10' + # data_sampler = "uniform" #s = "dirichlet" # "limit labels" || "q sampler" || "dirichlet" || "uniform" (default) + # data_sampler = "limit labels flex" 
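+    # 'n labels' presumably limits each client to n distinct classes; data_sampler_args below would then read as [n, seed] = [2, 42]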
+ data_sampler = "n labels" + # sampler: "uniform" # "limit labels" || "q sampler" || "dirichlet" || "uniform" (default) + # data_sampler_args = [0.07, 42] # random seed || random seed || random seed || unused + data_sampler_args = [2 , 42] # random seed || random seed || random seed || unused + DistDatasets = { + 'cifar10': DistCIFAR10Dataset, + 'cifar100': DistCIFAR100Dataset, + 'fashion-mnist': DistFashionMNISTDataset, + } + distributed = True + rank = 0 + world_size = 2 + logger = logging.Logger(__name__) + data_path = 'data' + cuda = False + + def get_net(self): + return self.net + + def init_logger(self, logger): + self.logger = logger + + def get_distributed(self): + return self.distributed + + def get_rank(self): + return self.rank + + def get_world_size(self): + return self.world_size + + def get_sampler(self): + return self.data_sampler + + def get_sampler_args(self): + return tuple(self.data_sampler_args) + def get_logger(self): + return self.logger + + def get_data_path(self): + return self.data_path + +def gen_distribution(name, params): + world_size = num_clients + 1 + datasets = [] + idx2class = None + distributions = {} + for rank in range(world_size): + if rank == 0: + continue + print(f'node {rank}') + args = BareConfig() + args.init_logger(logging) + args.data_sampler = name + + + # args.set_net_by_name('MNISTCNN') + # args.dataset_name = 'mnist' + args.set_net_by_name('FashionMNISTCNN') + args.dataset_name = 'fashion-mnist' + # data_sampler = "uniform" #s = "dirichlet" # "limit labels" || "q sampler" || "dirichlet" || "uniform" (default) + # data_sampler = "limit labels flex" + args.data_sampler = "n labels" + args.data_sampler_args = [2 , 42] + args.world_size = world_size + args.rank = rank + dataset: DistDataset = args.DistDatasets[args.dataset_name](args) + datasets.append((args, dataset)) + # test_loader = dataset.get_test_loader() + # train_loader = dataset.get_train_loader() + # class_dict = dataset.train_dataset.class_to_idx + print('Iterating over all items') + batch_size = 16 + # for i, (inputs, labels) in enumerate(dataset.get_train_loader(), 0): + # print(labels) + # print('d') + client = Client("test", None, rank, args.world_size, args) + client.init_dataloader() + train_loader = client.dataset.get_train_loader() + train_loader2 = dataset.get_train_loader() + test_loader = client.dataset.get_test_loader() + test_loader2 = dataset.get_test_loader() + idx2class = {v: k for k, v in train_loader.dataset.class_to_idx.items()} + + count_dict = {k: 0 for k, v in train_loader.dataset.class_to_idx.items()} + for (inputs, labels) in tqdm(train_loader): + for element in labels.numpy(): + # y_lbl = element[1] + y_lbl = idx2class[element] + count_dict[y_lbl] += 1 + if rank not in distributions: + distributions[rank] = {} + distributions[rank]['train'] = count_dict + count_dict = {k: 0 for k, v in train_loader.dataset.class_to_idx.items()} + for (inputs, labels) in tqdm(test_loader): + for element in labels.numpy(): + # y_lbl = element[1] + y_lbl = idx2class[element] + count_dict[y_lbl] += 1 + if rank not in distributions: + distributions[rank] = {} + distributions[rank]['test'] = count_dict + + # return count_dict + + label_data = [] + + for i, data_ in distributions.items(): + for k, v in data_['train'].items(): + label_data.append([i, k, v, 'train', name]) + for k, v in data_['test'].items(): + label_data.append([i, k, v, 'test', name]) + return label_data + +def get_client_distributions(): + + prefix = f'{num_clients}_clients' + all_label_data = [] + for key, value 
in dist_settings.items(): + print(key, value) + all_label_data += gen_distribution(key, value) + + # + # world_size = num_clients + 1 + # datasets = [] + # idx2class = None + # distributions = {} + # for rank in range(world_size): + # if rank == 0: + # continue + # print(f'node {rank}') + # args = dummy_args() + # args.world_size = world_size + # args.rank = rank + # dataset : DistDataset = args.DistDatasets[args.dataset_name](args) + # datasets.append((args, dataset)) + # test_loader = dataset.get_test_loader() + # train_loader = dataset.get_train_loader() + # class_dict = dataset.train_dataset.class_to_idx + # print('Iterating over all items') + # batch_size = 16 + # # for i, (inputs, labels) in enumerate(dataset.get_train_loader(), 0): + # # print(labels) + # # print('d') + # train_loader = dataset.get_train_loader() + # test_loader = dataset.get_test_loader() + # idx2class = {v: k for k, v in train_loader.dataset.class_to_idx.items()} + # + # count_dict = {k: 0 for k, v in train_loader.dataset.class_to_idx.items()} + # for (inputs, labels) in tqdm(train_loader): + # for element in labels.numpy(): + # # y_lbl = element[1] + # y_lbl = idx2class[element] + # count_dict[y_lbl] += 1 + # if rank not in distributions: + # distributions[rank] = {} + # distributions[rank]['train'] = count_dict + # count_dict = {k: 0 for k, v in train_loader.dataset.class_to_idx.items()} + # for (inputs, labels) in tqdm(test_loader): + # for element in labels.numpy(): + # # y_lbl = element[1] + # y_lbl = idx2class[element] + # count_dict[y_lbl] += 1 + # if rank not in distributions: + # distributions[rank] = {} + # distributions[rank]['test'] = count_dict + # + # # return count_dict + # + # label_data = [] + # + # for i, data_ in distributions.items(): + # for k,v in data_['train'].items(): + # label_data.append([i, k, v, 'train']) + # for k,v in data_['test'].items(): + # label_data.append([i, k, v, 'test']) + + df = pd.DataFrame(all_label_data, columns=['node', 'key', 'value', 'type', 'sampler']) + + import matplotlib.pyplot as plt + import seaborn as sns + + plt.figure() + # g = sns.FacetGrid(df, col='node', row='type') + # g = sns.FacetGrid(df, row='node', col='type') + g = sns.FacetGrid(df, col='node', row='sampler', hue='type') + g.map(sns.barplot, 'key', 'value') + plt.savefig(f'{prefix}_dist_plot.png') + # sns.barplot(data=df, x='key', y='value') + plt.show() + + # print(distributions) + print('Train distribution per client:') + print(df.groupby('node')['value'].sum().reset_index()) + sampler = 1 + distributed = True + rank = 1 + world_size = 2 + data_set = 'cifar10' + + net = 'Cifar10CNN' + dataset = 'cifar10' + sampler = "dirichlet" # "limit labels" || "q sampler" || "dirichlet" || "uniform" (default) + # sampler: "uniform" # "limit labels" || "q sampler" || "dirichlet" || "uniform" (default) + sampler_args = [0.07, 42] # random seed || random seed || random seed || unused + + dist_datasets = { + 'cifar10': DistCIFAR10Dataset, + 'cifar100': DistCIFAR100Dataset, + 'fashion-mnist': DistFashionMNISTDataset, + } + + # args = dummy_args() + # ddataset: DistDataset = args.DistDatasets[args.dataset_name](args) + + # print(len(list(ddataset.get_train_loader()))) + # print('Done') + + + + +if __name__ == '__main__': + # if len(sys.argv) <= 1: + # print('Missing arguments') + # exit(1) + # num_clients = sys.argv[1] + # print(f'Calculating client distributions for {num_clients} number of clients') + get_client_distributions() + diff --git a/fltk/util/task/config/parameter.py b/fltk/util/task/config/parameter.py 
index 9cfb082d..30d198b4 100644 --- a/fltk/util/task/config/parameter.py +++ b/fltk/util/task/config/parameter.py @@ -22,6 +22,7 @@ class HyperParameters: lr: str = field(metadata=config(field_name="learningRate")) lr_decay: str = field(metadata=config(field_name="learningrateDecay")) + @dataclass_json @dataclass(frozen=True) class Priority: diff --git a/fltk/util/timer.py b/fltk/util/timer.py new file mode 100644 index 00000000..bb0d08d9 --- /dev/null +++ b/fltk/util/timer.py @@ -0,0 +1,11 @@ +import time +from contextlib import contextmanager +# from timeit import default_timer + +@contextmanager +def elapsed_timer(): + start = time.time() + elapser = lambda: time.time() - start + yield lambda: elapser() + end = time.time() + elapser = lambda: end-start \ No newline at end of file diff --git a/run_multi_exp.bash b/run_multi_exp.bash new file mode 100644 index 00000000..c86dba06 --- /dev/null +++ b/run_multi_exp.bash @@ -0,0 +1,50 @@ +#!/bin/bash + +## declare an array variable +declare -a arr=( + "configs/exp_p2_vanilla.yaml" + # "configs/experiment_vanilla.yaml" + # "configs/experiment_deadline.yaml" + # "configs/experiment_swyh.yaml" + # "configs/experiment_freeze.yaml" + # "configs/experiment_offload.yaml" + ) +EVENT_FILE="exp_events.txt" +# Check if all files are present +for i in "${arr[@]}" +do +# echo "$i" + if [ ! -f $i ]; then + echo "File not found! Cannot find: $i" +# exit + fi + # or do whatever with individual element of the array +done + +read -p "Do you wish to continue? (y/n)?" choice +case "$choice" in + y|Y ) ;; + n|N ) exit;; + * ) exit;; +esac + +echo "" > $EVENT_FILE + +# Start running experiments +## now loop through the above array +for i in "${arr[@]}" +do + export EXP_CONFIG_FILE="$i" + echo "[$(date +"%T")] Starting $EXP_CONFIG_FILE" + echo "[$(date +"%T")] Starting $EXP_CONFIG_FILE" >> $EVENT_FILE + start_time=$(date +%s) + docker-compose up --build 2>&1 | tee dc_log.txt + end_time=$(date +%s) + # elapsed time with second resolution + elapsed=$(( end_time - start_time )) + echo "[$(date +"%T")] Finished with $EXP_CONFIG_FILE in $elapsed seconds" >> $EVENT_FILE +# docker-compose up + # or do whatever with individual element of the array +done +echo "[$(date +"%T")] Finished all experiments" +echo "[$(date +"%T")] Finished all experiments" >> $EVENT_FILE diff --git a/test_node_synchronous.py b/test_node_synchronous.py new file mode 100644 index 00000000..dfe5cfa6 --- /dev/null +++ b/test_node_synchronous.py @@ -0,0 +1,44 @@ +import os +import sys +import torch +import torch.distributed.rpc as rpc + +from fltk.core.client import Client +from fltk.core.federator import Federator +from fltk.core.node import Node +from fltk.util.config import Config + +if __name__ == '__main__': + world_size = 2 + config = Config() + config.num_clients = world_size - 1 + config.world_size = world_size + config.clients_per_round = 1 + config.epochs = 2 + config.rounds = 20 + config.cuda = True + config.single_machine = True + + fed = Federator('fed0', 0, world_size, config) + fed.run() + + # n1 = Client('c1', 0, world_size, config) + # n2 = Client('c2', 1, world_size, config) + # n3 = Client('c3', 2, world_size, config) + # n1.init_dataloader() + # n2.init_dataloader() + # n3.init_dataloader() + # + # response = n1.message(n2, Client.ping, 'new_sender') + # print(response) + # response = n3.message(n1, Client.ping, 'new_sender', be_weird=True) + # print(response) + # + # _, _, accuracy_n1, _ = n3.message(n1, Client.exec_round, 1) + # _, _, accuracy_n2, _ = n1.message(n2, 
Client.exec_round, 1) + # _, _, accuracy_n3, _ = n1.message(n3, Client.exec_round, 1) + # print(f'Client n1 has an accuracy of {accuracy_n1}') + # print(f'Client n2 has an accuracy of {accuracy_n2}') + # print(f'Client n3 has an accuracy of {accuracy_n3}') + # + # print(config)
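
Usage sketch for the new experiment tooling in fltk/util/generate_experiments.py, assuming an experiment folder that contains a descr.yaml plus one or more *.cfg.yaml override files; the folder path below is hypothetical.

from pathlib import Path

from fltk.util.generate_experiments import generate, run

if __name__ == '__main__':
    # Hypothetical experiment folder holding descr.yaml and *.cfg.yaml overrides.
    experiment_dir = Path('configs/dev_experiment')
    # Merge descr.yaml with every .cfg override into <experiment_dir>/exps/.
    generate(experiment_dir)
    # Run each generated config: via docker-compose when descr.yaml describes a
    # docker deployment, otherwise directly through 'python3 -m fltk single <cfg>'.
    run(experiment_dir)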