From 2a9e2367f518e46ea50fd573fc7f85b116756847 Mon Sep 17 00:00:00 2001 From: Jeroen Galjaard Date: Mon, 7 Jun 2021 11:58:17 +0200 Subject: [PATCH] Update to large scale experiments --- charts/fltk-values.yaml | 2 +- charts/worker/templates/client-slow.yaml | 5 +- charts/worker/values.yaml | 4 +- configs/cloud_experiment.yaml | 6 +- fltk/client.py | 14 +--- fltk/federator.py | 83 +++++++++++++++++------- 6 files changed, 71 insertions(+), 43 deletions(-) diff --git a/charts/fltk-values.yaml b/charts/fltk-values.yaml index a6247db0..2dead84f 100644 --- a/charts/fltk-values.yaml +++ b/charts/fltk-values.yaml @@ -1,4 +1,4 @@ fltk: - worldsize: 10 + worldsize: 50 config: cloud_experiment.yaml port: 5000 \ No newline at end of file diff --git a/charts/worker/templates/client-slow.yaml b/charts/worker/templates/client-slow.yaml index aa293355..a18772f9 100644 --- a/charts/worker/templates/client-slow.yaml +++ b/charts/worker/templates/client-slow.yaml @@ -43,8 +43,11 @@ spec: cpu: {{ $workercpu }} # 1 GiB ? memory: {{ $workermemory }} + requests: + memory: 1500000000 restartPolicy: Never -status: {} +status: + qosClass: Guaranteed # Helm requires seperation. --- {{- end }} diff --git a/charts/worker/values.yaml b/charts/worker/values.yaml index 5576c851..f46835fc 100644 --- a/charts/worker/values.yaml +++ b/charts/worker/values.yaml @@ -1,3 +1,3 @@ worker: - cpu: 750m - memory: 1300000000 + cpu: 500m + memory: 1500000000 diff --git a/configs/cloud_experiment.yaml b/configs/cloud_experiment.yaml index 985aa75b..566ab044 100644 --- a/configs/cloud_experiment.yaml +++ b/configs/cloud_experiment.yaml @@ -13,7 +13,7 @@ cuda: false experiment_prefix: 'experiment_single_machine' output_location: 'output' tensor_board_active: true -clients_per_round: 10 +clients_per_round: 50 system: federator: # Use the SERVICE provided by the fl-server to connect @@ -21,11 +21,11 @@ system: # Default NIC is eth0 nic: 'eth0' clients: - amount: 10 + amount: 50 poison: seed: 420 ratio: 0.2 attack: type: "flip" config: - - 6: 4 + - 5: 3 diff --git a/fltk/client.py b/fltk/client.py index 437a8581..61ed60f7 100644 --- a/fltk/client.py +++ b/fltk/client.py @@ -89,17 +89,6 @@ def reset_model(self): @return: None @rtype: None """ - # Load the default model - # Delete the network to prevent out of memory exceptions being thrown - try: - del self.net - - # Delete dataloader to prevent out of memory exceptions being thrown - del self.dataset - except Exception as e: - print(f"something went wrong: {e}") - # Load network - self.set_net(self.load_default_model()) # Set loss function for gradient calculation self.loss_function = self.args.get_loss_function()() # Create optimizer (default is SGD): TODO: Move to AdamW? @@ -112,6 +101,7 @@ def reset_model(self): self.args.get_min_lr()) # Reset the epoch counter self.epoch_counter = 0 + self.finished_init = True def ping(self): """ @@ -156,7 +146,6 @@ def init_dataloader(self, pill: PoisonPill = None): # self.dataset = self.args.DistDatasets[self.args.dataset_name](self.args) - self.finished_init = True self.finished_init = True print("Done with init") logging.info('Done with init') @@ -320,6 +309,7 @@ def test(self): def run_epochs(self, num_epoch, pill: PoisonPill = None): """ """ + self.finished_init = False start_time_train = datetime.datetime.now() self.dataset.get_train_sampler().set_epoch_size(num_epoch) loss, weights = self.train(self.epoch_counter, pill) diff --git a/fltk/federator.py b/fltk/federator.py index 28cdeac2..1ddea866 100644 --- a/fltk/federator.py +++ b/fltk/federator.py @@ -112,9 +112,13 @@ def create_clients(self, client_id_triple): self.client_data[id] = [] def update_clients(self, ratio): + # Prevent abrupt ending of the client + self.tb_writer.close() self.tb_writer = SummaryWriter(f'{self.tb_path_base}/{self.config.experiment_prefix}_federator') for client in self.clients: + # Create new writer and close old writer writer = SummaryWriter(f'{self.tb_path_base}/{self.config.experiment_prefix}_client_{client.name}_{ratio}') + client.tb_writer.close() client.tb_writer = writer self.client_data[client.name] = [] @@ -179,7 +183,7 @@ def clients_ready(self): time.sleep(2) logging.info('All clients are ready') - def remote_run_epoch(self, epochs, cur_model: torch.nn.Module, ratio = None): + def remote_run_epoch(self, epochs, cur_model: torch.nn.Module, ratio = None, store_grad=False): responses = [] client_weights = [] selected_clients = self.select_clients(self.config.clients_per_round) @@ -194,13 +198,23 @@ def remote_run_epoch(self, epochs, cur_model: torch.nn.Module, ratio = None): pill = self.attack.get_poison_pill() responses.append((client, _remote_method_async(Client.run_epochs, client.ref, num_epoch=epochs, pill=pill))) self.epoch_counter += epochs - flat_current = flatten_params(cur_model.state_dict()) - for res in responses: - epoch_data, weights = res[1].wait() - # get flatten + accuracy, loss, class_precision, class_recall = self.test_data.test() + # self.tb_writer.add_scalar('training loss', loss, self.epoch_counter * self.test_data.get_client_datasize()) # does not seem to work :( ) + self.tb_writer.add_scalar('accuracy', accuracy, self.epoch_counter * self.test_data.get_client_datasize()) + self.tb_writer.add_scalar('accuracy per epoch', accuracy, self.epoch_counter) + flat_current = None - self.store_gradient(flatten_params(weights) - flat_current, epoch_data.client_id, self.epoch_counter, ratio) + # Test the model before waiting for the model. + self.test_model() + + if store_grad: + flat_current = flatten_params(cur_model.state_dict()) + for res in responses: + epoch_data, weights = res[1].wait() + if store_grad: + # get flatten + self.store_gradient(flatten_params(weights) - flat_current, epoch_data.client_id, self.epoch_counter, ratio) self.client_data[epoch_data.client_id].append(epoch_data) logging.info(f'{res[0]} had a loss of {epoch_data.loss}') logging.info(f'{res[0]} had a epoch data of {epoch_data}') @@ -227,20 +241,8 @@ def remote_run_epoch(self, epochs, cur_model: torch.nn.Module, ratio = None): # test global model logging.info("Testing on global test set") self.test_data.update_nn_parameters(updated_model) - accuracy, loss, class_precision, class_recall = self.test_data.test() - # self.tb_writer.add_scalar('training loss', loss, self.epoch_counter * self.test_data.get_client_datasize()) # does not seem to work :( ) - self.tb_writer.add_scalar('accuracy', accuracy, self.epoch_counter * self.test_data.get_client_datasize()) - self.tb_writer.add_scalar('accuracy per epoch', accuracy, self.epoch_counter) - - responses = [] - for client in self.clients: - responses.append( - (client, _remote_method_async(Client.update_nn_parameters, client.ref, new_params=updated_model))) - - for res in responses: - res[1].wait() - logging.info('Weights are updated') - return self.test_data.net + self.distribute_new_model(updated_model) + return updated_model def update_client_data_sizes(self): responses = [] @@ -275,7 +277,7 @@ def save_epoch_data(self, ratio = None): def ensure_path_exists(self, path): Path(path).mkdir(parents=True, exist_ok=True) - def run(self, ratios = [0.1, 0.2, 0.3] ): + def run(self, ratios = [0.06, 0.12, 0.18] ): """ Main loop of the Federator :return: @@ -286,9 +288,11 @@ def run(self, ratios = [0.1, 0.2, 0.3] ): poison_pill = None save_path = self.config for rat in ratios: - # Get model to calculate gradient updates + # Get model to calculate gradient updates, default is shared between all. model = initialize_default_model(self.config, self.config.get_net()) - + # Re-use the functionality to update + self.distribute_new_model(model.state_dict()) + # Update the clients to point to the newer version. self.update_clients(rat) if self.attack: self.poisoned_workers: List[ClientRef] = self.attack.select_poisoned_workers(self.clients, rat) @@ -296,7 +300,6 @@ def run(self, ratios = [0.1, 0.2, 0.3] ): with open(f"{self.tb_path_base}/config_{rat}_poisoned.txt", 'w') as f: f.writelines(list(map(lambda worker: worker.name, self.poisoned_workers))) poison_pill = self.attack.get_poison_pill() - self.client_reset_model() self.client_load_data(poison_pill) self.ping_all() self.clients_ready() @@ -316,6 +319,11 @@ def run(self, ratios = [0.1, 0.2, 0.3] ): logging.info(f'Saving data') self.save_epoch_data(rat) + # Perform last test on the current model. + self.test_model() + # Reset the model to continue with the next round + self.client_reset_model() + logging.info(f'Federator is stopping') def store_gradient(self, gradient, client_id, epoch, ratio): @@ -337,3 +345,30 @@ def store_gradient(self, gradient, client_id, epoch, ratio): pathlib.Path(directory).mkdir(parents=True, exist_ok=True) # Save using pytorch. torch.save(gradient, f"{directory}/gradient.pt") + + def distribute_new_model(self, updated_model): + """ + Function to update the model on the clients + @return: + @rtype: + """ + responses = [] + for client in self.clients: + responses.append( + (client, _remote_method_async(Client.update_nn_parameters, client.ref, new_params=updated_model))) + + for res in responses: + res[1].wait() + logging.info('Weights are updated') + + def test_model(self): + """ + Function to test the model on the test dataset. + @return: + @rtype: + """ + # Test interleaved to speed up execution, i.e. don't keep the clients waiting. + accuracy, loss, class_precision, class_recall = self.test_data.test() + # self.tb_writer.add_scalar('training loss', loss, self.epoch_counter * self.test_data.get_client_datasize()) # does not seem to work :( ) + self.tb_writer.add_scalar('accuracy', accuracy, self.epoch_counter * self.test_data.get_client_datasize()) + self.tb_writer.add_scalar('accuracy per epoch', accuracy, self.epoch_counter)