diff --git a/Dockerfile b/Dockerfile
index 4e249583..cd8fbed3 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -29,7 +29,7 @@ RUN --mount=type=cache,target=/root/.cache/pip python3 -m pip install -r require
 ADD configs configs
 ADD fltk fltk
-
+COPY cloud_configs/cloud_experiment.yaml configs/cloud_config.yaml
 # Install newest version of library
 RUN python3 -m setup install
diff --git a/configs/local_experiment.yaml b/configs/local_experiment.yaml
index 737dbcd1..5e09ea8a 100644
--- a/configs/local_experiment.yaml
+++ b/configs/local_experiment.yaml
@@ -1,5 +1,5 @@
 # Experiment configuration
-total_epochs: 130
+total_epochs: 1
 epochs_per_cycle: 1
 wait_for_clients: true
 net: Cifar10CNN
@@ -30,6 +30,5 @@
 poison:
   type: "flip"
   config:
     - 5: 3
-    - 3: 5
diff --git a/fltk/client.py b/fltk/client.py
index 61ed60f7..b9e252a3 100644
--- a/fltk/client.py
+++ b/fltk/client.py
@@ -99,9 +99,16 @@ def reset_model(self):
                                            self.args.get_scheduler_step_size(),
                                            self.args.get_scheduler_gamma(),
                                            self.args.get_min_lr())
+        # Reset logger
+        self.args.init_logger(logging)
         # Reset the epoch counter
         self.epoch_counter = 0
-        self.finished_init = True
+        self.finished_init = False
+        # Dataset will be re-initialized
+        del self.dataset
+        # This will be set afterwards, but we delete possible gradient information.
+        del self.net
+        self.set_net(self.load_default_model())

     def ping(self):
         """
@@ -215,6 +222,7 @@ def update_nn_parameters(self, new_params):
         :param new_params: New weights for the neural network
         :type new_params: dict
         """
+        self.net.load_state_dict(copy.deepcopy(new_params), strict=True)
         if self.log_rref:
             self.remote_log(f'Weights of the model are updated')
@@ -235,6 +243,7 @@ def train(self, epoch, pill: PoisonPill = None):
         if self.args.distributed:
             self.dataset.train_sampler.set_epoch(epoch)

+
         for i, (inputs, labels) in enumerate(self.dataset.get_train_loader(), 0):
             inputs, labels = inputs.to(self.device), labels.to(self.device)
             # TODO: check if these parameters are correct, labels or ouputs?
@@ -243,7 +252,7 @@ def train(self, epoch, pill: PoisonPill = None):
                 inputs, labels = pill.poison_output(inputs, labels)

             # zero the parameter gradients
-            self.optimizer.zero_grad()
+            self.optimizer.zero_grad(set_to_none=True)

             # forward + backward + optimize
@@ -254,7 +263,7 @@ def train(self, epoch, pill: PoisonPill = None):
             self.optimizer.step()

             # print statistics
-            running_loss += loss.item()
+            running_loss += loss.detach().item()
             if i % self.args.get_log_interval() == 0:
                 self.args.get_logger().info('[%d, %5d] loss: %.3f' % (epoch, i, running_loss / self.args.get_log_interval()))
                 final_running_loss = running_loss / self.args.get_log_interval()
@@ -327,7 +336,7 @@ def run_epochs(self, num_epoch, pill: PoisonPill = None):

         # Copy GPU tensors to CPU
         for k, v in weights.items():
-            weights[k] = v.cpu()
+            weights[k] = v.cpu().detach()
         return data, weights

     def save_model(self, epoch, suffix):
diff --git a/fltk/federator.py b/fltk/federator.py
index 1ddea866..8defb3e4 100644
--- a/fltk/federator.py
+++ b/fltk/federator.py
@@ -114,12 +114,13 @@ def create_clients(self, client_id_triple):
     def update_clients(self, ratio):
         # Prevent abrupt ending of the client
         self.tb_writer.close()
-        self.tb_writer = SummaryWriter(f'{self.tb_path_base}/{self.config.experiment_prefix}_federator')
+        self.tb_writer = SummaryWriter(f'{self.tb_path_base}/{self.config.experiment_prefix}_federator_{ratio}')
         for client in self.clients:
             # Create new writer and close old writer
             writer = SummaryWriter(f'{self.tb_path_base}/{self.config.experiment_prefix}_client_{client.name}_{ratio}')
             client.tb_writer.close()
             client.tb_writer = writer
+            # Clear client updates afterwards
             self.client_data[client.name] = []

@@ -277,7 +278,7 @@ def save_epoch_data(self, ratio = None):
     def ensure_path_exists(self, path):
         Path(path).mkdir(parents=True, exist_ok=True)

-    def run(self, ratios = [0.06, 0.12, 0.18] ):
+    def run(self, ratios = [0.0, 0.06, 0.12, 0.18]):
         """
         Main loop of the Federator
         :return:
@@ -292,6 +293,7 @@ def run(self, ratios = [0.06, 0.12, 0.18] ):
             model = initialize_default_model(self.config, self.config.get_net())
             # Re-use the functionality to update
             self.distribute_new_model(model.state_dict())
+            # Update the clients to point to the newer version.
             self.update_clients(rat)

             if self.attack:
@@ -308,6 +310,7 @@ def run(self, ratios = [0.06, 0.12, 0.18] ):
             addition = 0
             epoch_to_run = self.config.epochs
             epoch_size = self.config.epochs_per_cycle
+            print(f"Running a total of {epoch_to_run} epochs...")
             for epoch in range(epoch_to_run):
                 print(f'Running epoch {epoch}')
                 # Get new model during run, update iteratively. The model is needed to calculate the
diff --git a/fltk/util/fed_avg.py b/fltk/util/fed_avg.py
index e60d1684..77631fa5 100644
--- a/fltk/util/fed_avg.py
+++ b/fltk/util/fed_avg.py
@@ -7,6 +7,6 @@ def average_nn_parameters(parameters):
     """
     new_params = {}
     for name in parameters[0].keys():
-        new_params[name] = sum([param[name].data for param in parameters]) / len(parameters)
+        new_params[name] = sum([param[name].data.clone() for param in parameters]) / len(parameters)

     return new_params
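
Addendum (outside the patch proper): the client.py and fed_avg.py hunks share one goal, namely that tensors crossing the federator/client boundary own their storage and carry no autograd history. Below is a minimal, standalone sketch of that pattern; the nn.Linear stand-ins and shapes are illustrative, not the project's actual models.

    import copy

    import torch
    import torch.nn as nn

    def average_nn_parameters(parameters):
        """Average a list of state_dict-style tensor dicts (FedAvg)."""
        new_params = {}
        for name in parameters[0].keys():
            # .clone() yields fresh tensors, so the averaged dict never
            # aliases any client's parameter storage.
            new_params[name] = sum(param[name].data.clone() for param in parameters) / len(parameters)
        return new_params

    if __name__ == '__main__':
        clients = [nn.Linear(4, 2) for _ in range(3)]
        avg = average_nn_parameters([c.state_dict() for c in clients])
        target = nn.Linear(4, 2)
        # Mirrors the update_nn_parameters hunk: deepcopy defensively before
        # loading, so the receiving model cannot share tensors with the sender.
        target.load_state_dict(copy.deepcopy(avg), strict=True)
        print(target.weight.shape)  # torch.Size([2, 4])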