From 33d304fd9b6abfc6652c1d3bdf571e69a89e001d Mon Sep 17 00:00:00 2001 From: kta-intel Date: Fri, 8 Nov 2024 09:05:09 -0800 Subject: [PATCH 01/40] initial xgboost workspace commit Signed-off-by: kta-intel --- openfl-workspace/xgb/.workspace | 1 + openfl-workspace/xgb/plan/cols.yaml | 5 +++++ openfl-workspace/xgb/plan/data.yaml | 5 +++++ openfl-workspace/xgb/plan/defaults | 1 + openfl-workspace/xgb/plan/plan.yaml | 2 ++ openfl-workspace/xgb/requirements.txt | 2 ++ openfl-workspace/xgb/src/__init__.py | 2 ++ openfl-workspace/xgb/src/dataloader.py | 3 +++ openfl-workspace/xgb/src/taskrunner.py | 3 +++ openfl/federated/data/loader_xgb.py | 0 openfl/federated/task/runner_xgb.py | 0 openfl/interface/aggregation_functions/fed_bagging.py | 0 12 files changed, 24 insertions(+) create mode 100644 openfl-workspace/xgb/.workspace create mode 100644 openfl-workspace/xgb/plan/cols.yaml create mode 100644 openfl-workspace/xgb/plan/data.yaml create mode 100644 openfl-workspace/xgb/plan/defaults create mode 100644 openfl-workspace/xgb/plan/plan.yaml create mode 100644 openfl-workspace/xgb/requirements.txt create mode 100644 openfl-workspace/xgb/src/__init__.py create mode 100644 openfl-workspace/xgb/src/dataloader.py create mode 100644 openfl-workspace/xgb/src/taskrunner.py create mode 100644 openfl/federated/data/loader_xgb.py create mode 100644 openfl/federated/task/runner_xgb.py create mode 100644 openfl/interface/aggregation_functions/fed_bagging.py diff --git a/openfl-workspace/xgb/.workspace b/openfl-workspace/xgb/.workspace new file mode 100644 index 0000000000..520b5e57c1 --- /dev/null +++ b/openfl-workspace/xgb/.workspace @@ -0,0 +1 @@ +current_plan_name: default diff --git a/openfl-workspace/xgb/plan/cols.yaml b/openfl-workspace/xgb/plan/cols.yaml new file mode 100644 index 0000000000..5b0f52178d --- /dev/null +++ b/openfl-workspace/xgb/plan/cols.yaml @@ -0,0 +1,5 @@ +# Copyright (C) 2024 Intel Corporation +# Licensed subject to the terms of the separately executed evaluation license agreement between Intel Corporation and you. + +# This file lists the collaborators associated with the federation. The list will be auto-populated during collaborator creation. +collaborators: diff --git a/openfl-workspace/xgb/plan/data.yaml b/openfl-workspace/xgb/plan/data.yaml new file mode 100644 index 0000000000..a6825c5ab1 --- /dev/null +++ b/openfl-workspace/xgb/plan/data.yaml @@ -0,0 +1,5 @@ +# Copyright (C) 2024 Intel Corporation +# Licensed subject to the terms of the separately executed evaluation license agreement between Intel Corporation and you. + +# This file specifies the local data directory associated with the respective collaborator. This will be auto-populated during collaborator creation +# collaborator_name,data_directory_path \ No newline at end of file diff --git a/openfl-workspace/xgb/plan/defaults b/openfl-workspace/xgb/plan/defaults new file mode 100644 index 0000000000..5042bedbcf --- /dev/null +++ b/openfl-workspace/xgb/plan/defaults @@ -0,0 +1 @@ +../../workspace/plan/defaults \ No newline at end of file diff --git a/openfl-workspace/xgb/plan/plan.yaml b/openfl-workspace/xgb/plan/plan.yaml new file mode 100644 index 0000000000..c61acfc6ed --- /dev/null +++ b/openfl-workspace/xgb/plan/plan.yaml @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# Licensed subject to the terms of the separately executed evaluation license agreement between Intel Corporation and you. 
\ No newline at end of file diff --git a/openfl-workspace/xgb/requirements.txt b/openfl-workspace/xgb/requirements.txt new file mode 100644 index 0000000000..aa6b070230 --- /dev/null +++ b/openfl-workspace/xgb/requirements.txt @@ -0,0 +1,2 @@ +xgboost +scikit-learn \ No newline at end of file diff --git a/openfl-workspace/xgb/src/__init__.py b/openfl-workspace/xgb/src/__init__.py new file mode 100644 index 0000000000..916f3a44b2 --- /dev/null +++ b/openfl-workspace/xgb/src/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 diff --git a/openfl-workspace/xgb/src/dataloader.py b/openfl-workspace/xgb/src/dataloader.py new file mode 100644 index 0000000000..6ae10f6b44 --- /dev/null +++ b/openfl-workspace/xgb/src/dataloader.py @@ -0,0 +1,3 @@ +# Copyright (C) 2024 Intel Corporation +# Licensed subject to the terms of the separately executed evaluation license agreement between +# Intel Corporation and you. \ No newline at end of file diff --git a/openfl-workspace/xgb/src/taskrunner.py b/openfl-workspace/xgb/src/taskrunner.py new file mode 100644 index 0000000000..6ae10f6b44 --- /dev/null +++ b/openfl-workspace/xgb/src/taskrunner.py @@ -0,0 +1,3 @@ +# Copyright (C) 2024 Intel Corporation +# Licensed subject to the terms of the separately executed evaluation license agreement between +# Intel Corporation and you. \ No newline at end of file diff --git a/openfl/federated/data/loader_xgb.py b/openfl/federated/data/loader_xgb.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/openfl/federated/task/runner_xgb.py b/openfl/federated/task/runner_xgb.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/openfl/interface/aggregation_functions/fed_bagging.py b/openfl/interface/aggregation_functions/fed_bagging.py new file mode 100644 index 0000000000..e69de29bb2 From 93dc8b403eeab1bf05c6e6bc8f9ccb1d5c55c79c Mon Sep 17 00:00:00 2001 From: kta-intel Date: Fri, 8 Nov 2024 12:01:03 -0800 Subject: [PATCH 02/40] updating taskrunner and aggregation function Signed-off-by: kta-intel --- openfl-workspace/xgb/plan/plan.yaml | 45 ++- openfl/federated/task/runner_xgb.py | 342 ++++++++++++++++++ .../aggregation_functions/fed_bagging.py | 84 +++++ 3 files changed, 470 insertions(+), 1 deletion(-) diff --git a/openfl-workspace/xgb/plan/plan.yaml b/openfl-workspace/xgb/plan/plan.yaml index c61acfc6ed..97842ded77 100644 --- a/openfl-workspace/xgb/plan/plan.yaml +++ b/openfl-workspace/xgb/plan/plan.yaml @@ -1,2 +1,45 @@ # Copyright (C) 2024 Intel Corporation -# Licensed subject to the terms of the separately executed evaluation license agreement between Intel Corporation and you. \ No newline at end of file +# Licensed subject to the terms of the separately executed evaluation license agreement between Intel Corporation and you. 
+ +aggregator : + defaults : plan/defaults/aggregator.yaml + template : openfl.component.aggregator.Aggregator + settings : + init_state_path : save/init.pbuf + best_state_path : save/best.pbuf + last_state_path : save/last.pbuf + rounds_to_train : 2 + write_logs : false + +collaborator : + defaults : plan/defaults/collaborator.yaml + template : openfl.component.collaborator.Collaborator + settings : + delta_updates : false + opt_treatment : RESET + +data_loader : + defaults : plan/defaults/data_loader.yaml + template : src.dataloader.TemplateDataLoader + settings : + {} + +task_runner : + defaults : plan/defaults/task_runner.yaml + template : src.taskrunner.TemplateTaskRunner + settings : + {} + +network : + defaults : plan/defaults/network.yaml + settings : + {} + +assigner : + defaults : plan/defaults/assigner.yaml + +tasks : + defaults : plan/defaults/tasks_xgb.yaml + +compression_pipeline : + defaults : plan/defaults/compression_pipeline.yaml \ No newline at end of file diff --git a/openfl/federated/task/runner_xgb.py b/openfl/federated/task/runner_xgb.py index e69de29bb2..ec111a08d5 100644 --- a/openfl/federated/task/runner_xgb.py +++ b/openfl/federated/task/runner_xgb.py @@ -0,0 +1,342 @@ +# Copyright 2020-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + + +"""XGBoostTaskRunner module.""" + +from copy import deepcopy +from typing import Iterator, Tuple + +import numpy as np +import json + +from openfl.federated.task.runner import TaskRunner +from openfl.utilities import Metric, TensorKey, change_tags +from openfl.utilities.split import split_tensor_dict_for_holdouts + +import xgboost as xgb +from openfl.utilities import LocalTensor +import json +from sklearn.datasets import fetch_california_housing +from sklearn.model_selection import train_test_split +from sklearn.metrics import root_mean_squared_error + + +class XGBoostTaskRunner(TaskRunner): + def __init__(self, **kwargs): + """Initializes the XGBoostTaskRunner object. + + Args: + **kwargs: Additional parameters to pass to the functions. + """ + super().__init__() + TaskRunner.__init__(self, **kwargs) + + # This is a map of all the required tensors for each of the public + # functions in XGBoostTaskRunner + self.required_tensorkeys_for_function = {} + self.training_round_completed = False + + + def validate_task(self, col_name, round_num, input_tensor_dict, use_tqdm=False, **kwargs): + """Validate Task. + + Run validation of the model on the local data. + + Args: + col_name (str): Name of the collaborator. + round_num (int): What round is it. + input_tensor_dict (dict): Required input tensors (for model). + use_tqdm (bool): Use tqdm to print a progress bar (Default=True). + **kwargs: Additional parameters. + + Returns: + global_output_dict (dict): Tensors to send back to the aggregator. + local_output_dict (dict): Tensors to maintain in the local + TensorDB. 
+ """ + if round_num != 0: + self.model = bytearray(input_tensor_dict) + + loader = self.data_loader.get_valid_loader() + + metric = self.validate_(loader) + + origin = col_name + suffix = "validate" + if kwargs["apply"] == "local": + suffix += "_local" + else: + suffix += "_agg" + tags = ("metric",) + tags = change_tags(tags, add_field=suffix) + # TODO figure out a better way to pass in metric for this pytorch + # validate function + output_tensor_dict = {TensorKey(metric.name, origin, round_num, True, tags): metric.value} + + # Empty list represents metrics that should only be stored locally + return output_tensor_dict, {} + + def train_task( + self, + col_name, + round_num, + input_tensor_dict, + use_tqdm=False, + epochs=1, + **kwargs, + ): + """Train batches task. + + Train the model on the requested number of batches. + + Args: + col_name (str): Name of the collaborator. + round_num (int): What round is it. + input_tensor_dict (dict): Required input tensors (for model). + use_tqdm (bool): Use tqdm to print a progress bar (Default=True). + epochs (int): The number of epochs to train. + **kwargs: Additional parameters. + + Returns: + global_output_dict (dict): Tensors to send back to the aggregator. + local_output_dict (dict): Tensors to maintain in the local + TensorDB. + """ + # self.rebuild_model(round_num, input_tensor_dict) + # set to "training" mode + if round_num != 0: + self.model = bytearray(input_tensor_dict) + loader = self.data_loader.get_train_loader() + metric = self.train_(loader) + # Output metric tensors (scalar) + origin = col_name + tags = ("trained",) + output_metric_dict = { + TensorKey(metric.name, origin, round_num, True, ("metric",)): metric.value + } + + # output model tensors (Doesn't include TensorKey) + output_model_dict = self.get_tensor_dict(with_opt_vars=True) + global_model_dict, local_model_dict = split_tensor_dict_for_holdouts( + self.logger, output_model_dict, **self.tensor_dict_split_fn_kwargs + ) + + # Create global tensorkeys + global_tensorkey_model_dict = { + TensorKey(tensor_name, origin, round_num, False, tags): nparray + for tensor_name, nparray in global_model_dict.items() + } + # Create tensorkeys that should stay local + local_tensorkey_model_dict = { + TensorKey(tensor_name, origin, round_num, False, tags): nparray + for tensor_name, nparray in local_model_dict.items() + } + # The train/validate aggregated function of the next round will look + # for the updated model parameters. + # This ensures they will be resolved locally + next_local_tensorkey_model_dict = { + TensorKey(tensor_name, origin, round_num + 1, False, ("model",)): nparray + for tensor_name, nparray in local_model_dict.items() + } + + global_tensor_dict = { + **output_metric_dict, + **global_tensorkey_model_dict, + } + local_tensor_dict = { + **local_tensorkey_model_dict, + **next_local_tensorkey_model_dict, + } + + # Update the required tensors if they need to be pulled from the + # aggregator + # TODO this logic can break if different collaborators have different + # roles between rounds. + # For example, if a collaborator only performs validation in the first + # round but training in the second, it has no way of knowing the + # optimizer state tensor names to request from the aggregator because + # these are only created after training occurs. A work around could + # involve doing a single epoch of training on random data to get the + # optimizer names, and then throwing away the model. 
+ if self.opt_treatment == "CONTINUE_GLOBAL": + self.initialize_tensorkeys_for_functions(with_opt_vars=True) + + # This will signal that the optimizer values are now present, + # and can be loaded when the model is rebuilt + self.training_round_completed = True + + # Return global_tensor_dict, local_tensor_dict + return global_tensor_dict, local_tensor_dict + + + def get_tensor_dict(self, with_opt_vars=False): + """Return the tensor dictionary. + + Args: + with_opt_vars (bool): Return the tensor dictionary including the + optimizer tensors (Default=False) + + Returns: + state (dict): Tensor dictionary {**dict, **optimizer_dict} + """ + # Gets information regarding tensor model layers and optimizer state. + # FIXME: self.parameters() instead? Unclear if load_state_dict() or + # simple assignment is better + # for now, state dict gives us names which is good + # FIXME: do both and sanity check each time? + + state = to_cpu_numpy(self.state_dict()) + + if with_opt_vars: + opt_state = _get_optimizer_state(self.optimizer) + state = {**state, **opt_state} + + return state + + + def get_required_tensorkeys_for_function(self, func_name, **kwargs): + """Get the required tensors for specified function that could be called + as part of a task. By default, this is just all of the layers and + optimizer of the model. + + Args: + func_name (str): The function name. + + Returns: + list : [TensorKey]. + """ + if func_name == "validate_task": + local_model = "apply=" + str(kwargs["apply"]) + return self.required_tensorkeys_for_function[func_name][local_model] + else: + return self.required_tensorkeys_for_function[func_name] + + def initialize_tensorkeys_for_functions(self, with_opt_vars=False): + """Set the required tensors for all publicly accessible task methods. + + By default, this is just all of the layers and optimizer of the model. + Custom tensors should be added to this function. + + Args: + with_opt_vars (bool): Flag to check if optimizer variables are + included. Defaults to False. + + Returns: + None + """ + # TODO there should be a way to programmatically iterate through + # all of the methods in the class and declare the tensors. 
+ # For now this is done manually + + output_model_dict = self.get_tensor_dict(with_opt_vars=with_opt_vars) + global_model_dict, local_model_dict = split_tensor_dict_for_holdouts( + self.logger, output_model_dict, **self.tensor_dict_split_fn_kwargs + ) + if not with_opt_vars: + global_model_dict_val = global_model_dict + local_model_dict_val = local_model_dict + else: + output_model_dict = self.get_tensor_dict(with_opt_vars=False) + global_model_dict_val, local_model_dict_val = split_tensor_dict_for_holdouts( + self.logger, + output_model_dict, + **self.tensor_dict_split_fn_kwargs, + ) + + self.required_tensorkeys_for_function["train_task"] = [ + TensorKey(tensor_name, "GLOBAL", 0, False, ("model",)) + for tensor_name in global_model_dict + ] + self.required_tensorkeys_for_function["train_task"] += [ + TensorKey(tensor_name, "LOCAL", 0, False, ("model",)) + for tensor_name in local_model_dict + ] + + self.required_tensorkeys_for_function["train_task"] = [ + TensorKey(tensor_name, "GLOBAL", 0, False, ("model",)) + for tensor_name in global_model_dict + ] + self.required_tensorkeys_for_function["train_task"] += [ + TensorKey(tensor_name, "LOCAL", 0, False, ("model",)) + for tensor_name in local_model_dict + ] + + # Validation may be performed on local or aggregated (global) model, + # so there is an extra lookup dimension for kwargs + self.required_tensorkeys_for_function["validate_task"] = {} + # TODO This is not stateless. The optimizer will not be + self.required_tensorkeys_for_function["validate_task"]["apply=local"] = [ + TensorKey(tensor_name, "LOCAL", 0, False, ("trained",)) + for tensor_name in {**global_model_dict_val, **local_model_dict_val} + ] + self.required_tensorkeys_for_function["validate_task"]["apply=global"] = [ + TensorKey(tensor_name, "GLOBAL", 0, False, ("model",)) + for tensor_name in global_model_dict_val + ] + self.required_tensorkeys_for_function["validate_task"]["apply=global"] += [ + TensorKey(tensor_name, "LOCAL", 0, False, ("model",)) + for tensor_name in local_model_dict_val + ] + + def save_native( + self, + filepath, + model_state_dict_key="model_state_dict", + optimizer_state_dict_key="optimizer_state_dict", + **kwargs, + ): + """Save model and optimizer states in a picked file specified by the + filepath. model_/optimizer_state_dicts are stored in the keys provided. + Uses pt.save(). + + Args: + filepath (str): Path to pickle file to be created by pt.save(). + model_state_dict_key (str): key for model state dict in pickled + file. + optimizer_state_dict_key (str): key for optimizer state dict in + picked file. + **kwargs: Additional parameters. + + Returns: + None + """ + pickle_dict = { + model_state_dict_key: self.state_dict(), + optimizer_state_dict_key: self.optimizer.state_dict(), + } + torch.save(pickle_dict, filepath) + + def train_(self, train_dataloader: Iterator[Tuple[np.ndarray, np.ndarray]]) -> Metric: + """Train single epoch. + + Override this function in order to use custom training. + + Args: + batch_generator (Iterator): Train dataset batch generator. Yields + (samples, targets) tuples of + size = `self.data_loader.batch_size`. + + Returns: + Metric: An object containing name and np.ndarray value. 
+ """ + losses = [] + for data, target in train_dataloader: + data, target = torch.tensor(data).to(self.device), torch.tensor(target).to(self.device) + self.optimizer.zero_grad() + output = self(data) + loss = self.loss_fn(output=output, target=target) + loss.backward() + self.optimizer.step() + losses.append(loss.detach().cpu().numpy()) + loss = np.mean(losses) + return Metric(name=self.loss_fn.__name__, value=np.array(loss)) + + def validate_(self, validation_dataloader) -> Metric: + """Validate model.""" + + dtest, y_test = validation_dataloader + preds = bst.predict(dtest) + rmse = root_mean_squared_error(y_test, preds) + + return Metric(name="accuracy", value=np.array(rmse)) diff --git a/openfl/interface/aggregation_functions/fed_bagging.py b/openfl/interface/aggregation_functions/fed_bagging.py index e69de29bb2..1f2cb1ca40 100644 --- a/openfl/interface/aggregation_functions/fed_bagging.py +++ b/openfl/interface/aggregation_functions/fed_bagging.py @@ -0,0 +1,84 @@ +# Copyright 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + + +"""Federated Boostrap Aggregation for XGBoost module.""" + +import json +from openfl.interface.aggregation_functions.core import AggregationFunction + + +def verify_global_model(global_model, local_model, num_global_trees): + for i in range(num_global_trees): + global_tree = global_model['learner']['gradient_booster']['model']['trees'][i] + global_tree_local = local_model['learner']['gradient_booster']['model']['trees'][i] + + assert global_tree == global_tree_local, \ + "Mismatch found in trees. Models are not from the same global model." + + +class FedBaggingXGBoost(AggregationFunction): + """Federated Boostrap Aggregation for XGBoost.""" + + def call(self, local_tensors, *_): + """Aggregate tensors. + + Args: + local_tensors (list[openfl.utilities.LocalTensor]): List of local + tensors to aggregate. + db_iterator: iterator over history of all tensors. Columns: + - 'tensor_name': name of the tensor. + Examples for `torch.nn.Module`s: 'conv1.weight','fc2.bias'. + - 'round': 0-based number of round corresponding to this + tensor. + - 'tags': tuple of tensor tags. Tags that can appear: + - 'model' indicates that the tensor is a model parameter. + - 'trained' indicates that tensor is a part of a training + result. + These tensors are passed to the aggregator node after + local learning. + - 'aggregated' indicates that tensor is a result of + aggregation. + These tensors are sent to collaborators for the next + round. + - 'delta' indicates that value is a difference between + rounds for a specific tensor. + also one of the tags is a collaborator name + if it corresponds to a result of a local task. + + - 'nparray': value of the tensor. 
+ tensor_name: name of the tensor + fl_round: round number + tags: tuple of tags for this tensor + Returns: + bytearray: aggregated tensor + """ + global_model = None + + for local_tensor in local_tensors: + if global_model is None: + global_model = json.loads(local_tensor.tensor['local_tree']) + else: + local_model = json.loads(local_tensor.tensor['local_tree']) + + # Assertion to check if the original trees in the local model match the global model trees + num_global_trees = local_tensor.tensor['num_global_trees'] + verify_global_model(global_model, local_model, num_global_trees) + + num_global_trees = int(global_model["learner"]["gradient_booster"]["model"]["gbtree_model_param"]["num_trees"]) + num_latest_trees = local_tensor.tensor['num_latest_trees'] + local_trees = local_model['learner']['gradient_booster']['model']['trees'][-num_latest_trees:] + + global_model["learner"]["gradient_booster"]["model"]["gbtree_model_param"]["num_trees"] = str( + num_global_trees + num_latest_trees + ) + global_model["learner"]["gradient_booster"]["model"]["iteration_indptr"].append( + num_global_trees + num_latest_trees + ) + + for new_tree in range(num_latest_trees): + local_trees[new_tree]["id"] = num_global_trees + new_tree + global_model["learner"]["gradient_booster"]["model"]["trees"].append(local_trees[new_tree]) + global_model["learner"]["gradient_booster"]["model"]["tree_info"].append(0) + + return bytearray(json.dumps(global_model), "utf-8") From 52fea847fd7195a6a12d4bbbff4a7cdb24f9d84b Mon Sep 17 00:00:00 2001 From: kta-intel Date: Tue, 12 Nov 2024 14:42:56 -0800 Subject: [PATCH 03/40] runner updates Signed-off-by: kta-intel --- openfl/federated/task/runner_xgb.py | 169 ++++++++++++++-------------- 1 file changed, 84 insertions(+), 85 deletions(-) diff --git a/openfl/federated/task/runner_xgb.py b/openfl/federated/task/runner_xgb.py index ec111a08d5..b2d48322a0 100644 --- a/openfl/federated/task/runner_xgb.py +++ b/openfl/federated/task/runner_xgb.py @@ -19,7 +19,7 @@ import json from sklearn.datasets import fetch_california_housing from sklearn.model_selection import train_test_split -from sklearn.metrics import root_mean_squared_error +from sklearn.metrics import r2_score class XGBoostTaskRunner(TaskRunner): @@ -34,11 +34,21 @@ def __init__(self, **kwargs): # This is a map of all the required tensors for each of the public # functions in XGBoostTaskRunner + self.bst = None # TODO + self.global_model = None # TODO + self.params = kwargs['params'] # TODO + self.num_rounds = kwargs['num_rounds'] # TODO + self.required_tensorkeys_for_function = {} self.training_round_completed = False + def rebuild_model(self, input_tensor_dict): + if input_tensor_dict is not None: + self.global_model = bytearray(input_tensor_dict) + self.bst = xgb.Booster() + self.bst.load_model(self.global_model) - def validate_task(self, col_name, round_num, input_tensor_dict, use_tqdm=False, **kwargs): + def validate_task(self, col_name, round_num, input_tensor_dict, **kwargs): """Validate Task. Run validation of the model on the local data. @@ -55,12 +65,19 @@ def validate_task(self, col_name, round_num, input_tensor_dict, use_tqdm=False, local_output_dict (dict): Tensors to maintain in the local TensorDB. """ - if round_num != 0: - self.model = bytearray(input_tensor_dict) - + # during agg validation, self.bst will still be None. 
during local validation, it will have a value - no need to rebuild + # if self.bst is still None after rebuilding, then there was no initial global model, so set metric to average loader = self.data_loader.get_valid_loader() + # if round_num != 0: + # self.global_model = bytearray(input_tensor_dict) - metric = self.validate_(loader) + if self.bst is None: + self.rebuild_model(input_tensor_dict) + + if round_num == 0: # if self.bst is None: + metric = Metric(name="accuracy", value=np.array(0)) # for first round, there is no global model, so set metric to 0 + else: + metric = self.validate_(loader) origin = col_name suffix = "validate" @@ -82,7 +99,6 @@ def train_task( col_name, round_num, input_tensor_dict, - use_tqdm=False, epochs=1, **kwargs, ): @@ -105,8 +121,9 @@ def train_task( """ # self.rebuild_model(round_num, input_tensor_dict) # set to "training" mode - if round_num != 0: - self.model = bytearray(input_tensor_dict) + # if round_num != 0: + # self.global_model = bytearray(input_tensor_dict) + self.rebuild_model(input_tensor_dict) loader = self.data_loader.get_train_loader() metric = self.train_(loader) # Output metric tensors (scalar) @@ -167,33 +184,28 @@ def train_task( self.training_round_completed = True # Return global_tensor_dict, local_tensor_dict - return global_tensor_dict, local_tensor_dict - + return global_tensor_dict, local_tensor_dict - def get_tensor_dict(self, with_opt_vars=False): - """Return the tensor dictionary. - - Args: - with_opt_vars (bool): Return the tensor dictionary including the - optimizer tensors (Default=False) - - Returns: - state (dict): Tensor dictionary {**dict, **optimizer_dict} - """ - # Gets information regarding tensor model layers and optimizer state. - # FIXME: self.parameters() instead? Unclear if load_state_dict() or - # simple assignment is better - # for now, state dict gives us names which is good - # FIXME: do both and sanity check each time? - - state = to_cpu_numpy(self.state_dict()) + def get_tensor_dict(self): + if self.global_model is None: + global_model_booster_dict = None + num_global_trees = 0 + else: + global_model_booster_dict = json.loads(bytearray(self.global_model)) + num_global_trees = int(global_model_booster_dict["learner"]["gradient_booster"]["model"]["gbtree_model_param"]["num_trees"]) - if with_opt_vars: - opt_state = _get_optimizer_state(self.optimizer) - state = {**state, **opt_state} + booster_array = self.bst.save_raw('json').decode('utf-8') + booster_dict = json.loads(booster_array) + num_total_trees = int(booster_dict["learner"]["gradient_booster"]["model"]["gbtree_model_param"]["num_trees"]) - return state + # Calculate the number of trees added in the latest training + num_latest_trees = num_total_trees - num_global_trees + return { + 'local_tree': booster_array, + 'num_global_trees': int(num_global_trees), + 'num_latest_trees': int(num_latest_trees) + } def get_required_tensorkeys_for_function(self, func_name, **kwargs): """Get the required tensors for specified function that could be called @@ -279,64 +291,51 @@ def initialize_tensorkeys_for_functions(self, with_opt_vars=False): for tensor_name in local_model_dict_val ] - def save_native( - self, - filepath, - model_state_dict_key="model_state_dict", - optimizer_state_dict_key="optimizer_state_dict", - **kwargs, - ): - """Save model and optimizer states in a picked file specified by the - filepath. model_/optimizer_state_dicts are stored in the keys provided. - Uses pt.save(). - - Args: - filepath (str): Path to pickle file to be created by pt.save(). 
- model_state_dict_key (str): key for model state dict in pickled - file. - optimizer_state_dict_key (str): key for optimizer state dict in - picked file. - **kwargs: Additional parameters. - - Returns: - None - """ - pickle_dict = { - model_state_dict_key: self.state_dict(), - optimizer_state_dict_key: self.optimizer.state_dict(), - } - torch.save(pickle_dict, filepath) - - def train_(self, train_dataloader: Iterator[Tuple[np.ndarray, np.ndarray]]) -> Metric: - """Train single epoch. - - Override this function in order to use custom training. - - Args: - batch_generator (Iterator): Train dataset batch generator. Yields - (samples, targets) tuples of - size = `self.data_loader.batch_size`. + # def save_native( + # self, + # filepath, + # model_state_dict_key="model_state_dict", + # optimizer_state_dict_key="optimizer_state_dict", + # **kwargs, + # ): + # """Save model and optimizer states in a picked file specified by the + # filepath. model_/optimizer_state_dicts are stored in the keys provided. + # Uses pt.save(). + + # Args: + # filepath (str): Path to pickle file to be created by pt.save(). + # model_state_dict_key (str): key for model state dict in pickled + # file. + # optimizer_state_dict_key (str): key for optimizer state dict in + # picked file. + # **kwargs: Additional parameters. + + # Returns: + # None + # """ + # pickle_dict = { + # model_state_dict_key: self.state_dict(), + # optimizer_state_dict_key: self.optimizer.state_dict(), + # } + # torch.save(pickle_dict, filepath) + + def train_(self, train_dataloader) -> Metric: + """Train model.""" + dtrain = train_dataloader + evals = [(dtrain, 'train')] + evals_result = {} + + self.bst = xgb.train(self.params, dtrain, self.num_rounds, xgb_model=self.bst, + evals=evals, evals_result=evals_result, verbose_eval=False) - Returns: - Metric: An object containing name and np.ndarray value. - """ - losses = [] - for data, target in train_dataloader: - data, target = torch.tensor(data).to(self.device), torch.tensor(target).to(self.device) - self.optimizer.zero_grad() - output = self(data) - loss = self.loss_fn(output=output, target=target) - loss.backward() - self.optimizer.step() - losses.append(loss.detach().cpu().numpy()) - loss = np.mean(losses) + loss = evals_result['train']['rmse'][-1] return Metric(name=self.loss_fn.__name__, value=np.array(loss)) def validate_(self, validation_dataloader) -> Metric: """Validate model.""" dtest, y_test = validation_dataloader - preds = bst.predict(dtest) - rmse = root_mean_squared_error(y_test, preds) + preds = self.bst.predict(dtest) + r2 = r2_score(y_test, preds) - return Metric(name="accuracy", value=np.array(rmse)) + return Metric(name="accuracy", value=np.array(r2)) From 1275fd6813d73d47ec039bd702406b6fd8f45242 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Tue, 12 Nov 2024 14:55:57 -0800 Subject: [PATCH 04/40] logic for loader Signed-off-by: kta-intel --- openfl/federated/data/loader_xgb.py | 166 ++++++++++++++++++++++++++++ 1 file changed, 166 insertions(+) diff --git a/openfl/federated/data/loader_xgb.py b/openfl/federated/data/loader_xgb.py index e69de29bb2..f43244f310 100644 --- a/openfl/federated/data/loader_xgb.py +++ b/openfl/federated/data/loader_xgb.py @@ -0,0 +1,166 @@ +import numpy as np +import xgboost as xgb +from math import ceil + +class XGBoostDataLoader: + """A class used to represent a Data Loader for XGBoost models. + + Attributes: + batch_size (int): Size of batches used for all data loaders. + X_train (np.array): Training features. + y_train (np.array): Training labels. 
+ X_valid (np.array): Validation features. + y_valid (np.array): Validation labels. + random_seed (int, optional): Random seed for data shuffling. + """ + + def __init__(self, batch_size, random_seed=None, **kwargs): + """Initializes the XGBoostDataLoader object with the batch size, random + seed, and any additional arguments. + + Args: + batch_size (int): The size of batches used for all data loaders. + random_seed (int, optional): Random seed for data shuffling. + kwargs: Additional arguments to pass to the function. + """ + self.batch_size = batch_size + self.X_train = None + self.y_train = None + self.X_valid = None + self.y_valid = None + self.random_seed = random_seed + + # Child classes should have init signature: + # (self, batch_size, **kwargs), should call this __init__ and then + # define self.X_train, self.y_train, self.X_valid, and self.y_valid + + def get_feature_shape(self): + """Returns the shape of an example feature array. + + Returns: + tuple: The shape of an example feature array. + """ + return self.X_train[0].shape + + def get_train_loader(self, batch_size=None, num_batches=None): + """Returns the data loader for the training data. + + Args: + batch_size (int, optional): The batch size for the data loader + (default is None). + num_batches (int, optional): The number of batches for the data + loader (default is None). + + Returns: + generator: The generator object for the training data. + """ + return self._get_batch_generator( + X=self.X_train, + y=self.y_train, + batch_size=batch_size, + num_batches=num_batches, + ) + + def get_valid_loader(self, batch_size=None): + """Returns the data loader for the validation data. + + Args: + batch_size (int, optional): The batch size for the data loader + (default is None). + + Returns: + generator: The generator object for the validation data. + """ + return self._get_batch_generator(X=self.X_valid, y=self.y_valid, batch_size=batch_size) + + def get_train_data_size(self): + """Returns the total number of training samples. + + Returns: + int: The total number of training samples. + """ + return self.X_train.shape[0] + + def get_valid_data_size(self): + """Returns the total number of validation samples. + + Returns: + int: The total number of validation samples. + """ + return self.X_valid.shape[0] + + @staticmethod + def _batch_generator(X, y, idxs, batch_size, num_batches): + """Generates batches of data. + + Args: + X (np.array): The input data. + y (np.array): The label data. + idxs (np.array): The index of the dataset. + batch_size (int): The batch size for the data loader. + num_batches (int): The number of batches. + + Yields: + tuple: The input data and label data for each batch. + """ + for i in range(num_batches): + a = i * batch_size + b = a + batch_size + yield X[idxs[a:b]], y[idxs[a:b]] + + def _get_batch_generator(self, X, y, batch_size, num_batches=None): + """Returns the dataset generator. + + Args: + X (np.array): The input data. + y (np.array): The label data. + batch_size (int): The batch size for the data loader. + num_batches (int, optional): The number of batches (default is + None). + + Returns: + generator: The dataset generator. 
+ """ + if batch_size is None: + batch_size = self.batch_size + + # shuffle data indices + if self.random_seed is not None: + np.random.seed(self.random_seed) + + idxs = np.random.permutation(np.arange(X.shape[0])) + + # compute the number of batches + if num_batches is None: + num_batches = ceil(X.shape[0] / batch_size) + + # build the generator and return it + return self._batch_generator(X, y, idxs, batch_size, num_batches) + + def get_dmatrix(self, X, y): + """Returns the DMatrix for the given data. + + Args: + X (np.array): The input data. + y (np.array): The label data. + + Returns: + xgb.DMatrix: The DMatrix object for the given data. + """ + return xgb.DMatrix(data=X, label=y) + + def get_train_dmatrix(self): + """Returns the DMatrix for the training data. + + Returns: + xgb.DMatrix: The DMatrix object for the training data. + """ + return self.get_dmatrix(self.X_train, self.y_train) + + def get_valid_dmatrix(self): + """Returns the DMatrix for the validation data. + + Returns: + xgb.DMatrix: The DMatrix object for the validation data. + """ + return self.get_dmatrix(self.X_valid, self.y_valid) \ No newline at end of file From 49f5cdffbd00283587cfa7af792ecde822d24ddd Mon Sep 17 00:00:00 2001 From: kta-intel Date: Wed, 13 Nov 2024 15:50:08 -0800 Subject: [PATCH 05/40] enabling work Signed-off-by: kta-intel --- .../workspace/plan/defaults/tasks_xgb.yaml | 21 ++++ openfl-workspace/xgb/plan/plan.yaml | 13 ++- openfl-workspace/xgb/src/dataloader.py | 29 ++++- openfl-workspace/xgb/src/setup_data.py | 94 ++++++++++++++++ openfl-workspace/xgb/src/taskrunner.py | 55 +++++++++- openfl/federated/__init__.py | 5 + openfl/federated/data/__init__.py | 5 + openfl/federated/data/loader_xgb.py | 12 +- openfl/federated/task/__init__.py | 3 + openfl/federated/task/runner_xgb.py | 103 ++++++++++++------ .../aggregation_functions/__init__.py | 1 + .../aggregation_functions/fed_bagging.py | 63 +++++++---- openfl/interface/plan.py | 1 - 13 files changed, 338 insertions(+), 67 deletions(-) create mode 100644 openfl-workspace/workspace/plan/defaults/tasks_xgb.yaml create mode 100644 openfl-workspace/xgb/src/setup_data.py diff --git a/openfl-workspace/workspace/plan/defaults/tasks_xgb.yaml b/openfl-workspace/workspace/plan/defaults/tasks_xgb.yaml new file mode 100644 index 0000000000..b61942d4f7 --- /dev/null +++ b/openfl-workspace/workspace/plan/defaults/tasks_xgb.yaml @@ -0,0 +1,21 @@ +aggregated_model_validation: + function : validate_task + kwargs : + apply : global + metrics : + - acc + +locally_tuned_model_validation: + function : validate_task + kwargs : + apply: local + metrics : + - acc + +train: + function : train_task + kwargs : + metrics : + - loss + aggregation_type : + template : openfl.interface.aggregation_functions.FedBaggingXGBoost \ No newline at end of file diff --git a/openfl-workspace/xgb/plan/plan.yaml b/openfl-workspace/xgb/plan/plan.yaml index 97842ded77..53de425499 100644 --- a/openfl-workspace/xgb/plan/plan.yaml +++ b/openfl-workspace/xgb/plan/plan.yaml @@ -20,15 +20,20 @@ collaborator : data_loader : defaults : plan/defaults/data_loader.yaml - template : src.dataloader.TemplateDataLoader + template : src.dataloader.HiggsDataLoader settings : - {} + input_shape : 28 task_runner : defaults : plan/defaults/task_runner.yaml - template : src.taskrunner.TemplateTaskRunner + template : src.taskrunner.XGBoostRunner settings : - {} + params : + objective: binary:logistic + eval_metric: logloss + max_depth: 6 + eta: 0.3 + num_parallel_tree: 1 network : defaults : plan/defaults/network.yaml 
diff --git a/openfl-workspace/xgb/src/dataloader.py b/openfl-workspace/xgb/src/dataloader.py
index 6ae10f6b44..3d792a91c9 100644
--- a/openfl-workspace/xgb/src/dataloader.py
+++ b/openfl-workspace/xgb/src/dataloader.py
@@ -1,3 +1,30 @@
 # Copyright (C) 2024 Intel Corporation
 # Licensed subject to the terms of the separately executed evaluation license agreement between
-# Intel Corporation and you.
\ No newline at end of file
+# Intel Corporation and you.
+
+from openfl.federated import XGBoostDataLoader
+import os
+import pandas as pd
+
+class HiggsDataLoader(XGBoostDataLoader):
+    def __init__(self, data_path, **kwargs):
+        super().__init__(**kwargs)
+        X_train, y_train, X_valid, y_valid = load_Higgs(
+            data_path, **kwargs
+        )
+        self.X_train = X_train
+        self.y_train = y_train
+        self.X_valid = X_valid
+        self.y_valid = y_valid
+
+
+def load_Higgs(data_path, **kwargs):
+    train_data = pd.read_csv(os.path.join(data_path, 'train.csv'), header=None)
+    X_train = train_data.iloc[:, 1:].values
+    y_train = train_data.iloc[:, 0].values
+
+    valid_data = pd.read_csv(os.path.join(data_path, 'valid.csv'), header=None)
+    X_valid = valid_data.iloc[:, 1:].values
+    y_valid = valid_data.iloc[:, 0].values
+
+    return X_train, y_train, X_valid, y_valid
\ No newline at end of file
diff --git a/openfl-workspace/xgb/src/setup_data.py b/openfl-workspace/xgb/src/setup_data.py
new file mode 100644
index 0000000000..8aaf197c7e
--- /dev/null
+++ b/openfl-workspace/xgb/src/setup_data.py
@@ -0,0 +1,94 @@
+import sys
+import os
+import shutil
+from logging import getLogger
+from urllib.request import urlretrieve
+from hashlib import sha384
+from os import path, makedirs
+from tqdm import tqdm
+import pandas as pd
+import gzip
+from sklearn.model_selection import train_test_split
+import numpy as np
+
+logger = getLogger(__name__)
+
+"""HIGGS Dataset."""
+
+URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz"
+FILENAME = "HIGGS.csv.gz"
+CSV_FILENAME = "HIGGS.csv"
+CSV_SHA384 = 'b8b82e11a78b81601381420878ad42ba557291f394a88dc5293e4077c8363c87429639b120e299a2a9939c1f943b6a63'
+DEFAULT_PATH = path.join(path.expanduser('~'), '.openfl', 'data')
+
+pbar = tqdm(total=None)
+
+def report_hook(count, block_size, total_size):
+    """Update progressbar."""
+    if pbar.total is None and total_size:
+        pbar.total = total_size
+    progress_bytes = count * block_size
+    pbar.update(progress_bytes - pbar.n)
+
+def verify_sha384(file_path, expected_hash):
+    """Verify the SHA-384 hash of a file."""
+    sha384_hash = sha384()
+    with open(file_path, 'rb') as f:
+        for byte_block in iter(lambda: f.read(4096), b""):
+            sha384_hash.update(byte_block)
+    computed_hash = sha384_hash.hexdigest()
+    if computed_hash != expected_hash:
+        raise ValueError(f"SHA-384 hash mismatch: expected {expected_hash}, got {computed_hash}")
+    print(f"SHA-384 hash verified: {computed_hash}")
+
+def setup_data(root: str = DEFAULT_PATH, **kwargs):
+    """Initialize."""
+    makedirs(root, exist_ok=True)
+    filepath = path.join(root, FILENAME)
+    csv_filepath = path.join(root, CSV_FILENAME)
+    if not path.exists(filepath):
+        urlretrieve(URL, filepath, report_hook)  # nosec
+        verify_sha384(filepath, CSV_SHA384)
+        # Extract the CSV file from the gzip file
+        with gzip.open(filepath, 'rb') as f_in:
+            with open(csv_filepath, 'wb') as f_out:
+                shutil.copyfileobj(f_in, f_out)
+
+def main():
+    if len(sys.argv) < 2:
+        raise ValueError("Provide the number of collaborators")
+    src = 'higgs_data'
+    if os.path.exists(src):
+        shutil.rmtree(src)
+    setup_data(src)
+    collaborators = int(sys.argv[1])
+    print("Creating splits for {} collaborators".format(collaborators))
+
+    # Load the dataset
+    higgs_data = pd.read_csv(path.join(src, CSV_FILENAME), header=None, nrows=1000000)
+
+    # Split the dataset into features and labels
+    X = higgs_data.iloc[:, 1:].values
+    y = higgs_data.iloc[:, 0].values
+
+    # Split the dataset into training and testing sets
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+    # Combine X and y for train and test sets
+    train_data = pd.DataFrame(data=np.column_stack((y_train, X_train)))
+    test_data = pd.DataFrame(data=np.column_stack((y_test, X_test)))
+
+    # Split the training data into parts for each collaborator
+    for i in range(collaborators):
+        dst = f'data/{i+1}'
+        makedirs(dst, exist_ok=True)
+
+        # Split the training data for the current collaborator
+        split_train_data = train_data.iloc[i::collaborators]
+        split_train_data.to_csv(path.join(dst, 'train.csv'), index=False, header=False)
+
+        # Copy the test data for the current collaborator
+        test_data.to_csv(path.join(dst, 'valid.csv'), index=False, header=False)
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/openfl-workspace/xgb/src/taskrunner.py b/openfl-workspace/xgb/src/taskrunner.py
index 6ae10f6b44..1899b11da9 100644
--- a/openfl-workspace/xgb/src/taskrunner.py
+++ b/openfl-workspace/xgb/src/taskrunner.py
@@ -1,3 +1,56 @@
 # Copyright (C) 2024 Intel Corporation
 # Licensed subject to the terms of the separately executed evaluation license agreement between
-# Intel Corporation and you.
\ No newline at end of file
+# Intel Corporation and you.
+
+"""You may copy this file as the starting point of your own model."""
+import numpy as np
+import xgboost as xgb
+
+from openfl.federated import XGBoostTaskRunner
+from openfl.utilities import Metric
+from sklearn.metrics import accuracy_score
+
+
+class XGBoostRunner(XGBoostTaskRunner):
+    """
+    Simple XGBoost model for binary classification.
+
+    XGBoostRunner inherits from XGBoostTaskRunner, so you can define your
+    training and validation logic in the same way that you would for XGBoost
+    """
+
+    def __init__(self, params=None, num_rounds=1, **kwargs):
+        """Initialize.
+ + Args: + **kwargs: Additional arguments to pass to the function + + """ + super().__init__(**kwargs) + + self.bst = None + self.params = params + self.num_rounds = num_rounds + + def train_(self, train_dataloader) -> Metric: + """Train model.""" + dtrain = train_dataloader['dmatrix'] + evals = [(dtrain, 'train')] + evals_result = {} + + self.bst = xgb.train(self.params, dtrain, self.num_rounds, xgb_model=self.bst, + evals=evals, evals_result=evals_result, verbose_eval=False) + + loss = evals_result['train']['logloss'][-1] + return Metric(name=self.params['eval_metric'], value=np.array(loss)) + + def validate_(self, validation_dataloader) -> Metric: + """Validate model.""" + + dtest = validation_dataloader['dmatrix'] + y_test = validation_dataloader['labels'] + preds = self.bst.predict(dtest) + y_pred_binary = np.where(preds > 0.5, 1, 0) + acc = accuracy_score(y_test, y_pred_binary) + + return Metric(name="accuracy", value=np.array(acc)) diff --git a/openfl/federated/__init__.py b/openfl/federated/__init__.py index 07c0ef8e6e..ea24e0ddfa 100644 --- a/openfl/federated/__init__.py +++ b/openfl/federated/__init__.py @@ -20,6 +20,11 @@ from openfl.federated.data import PyTorchDataLoader from openfl.federated.task import FederatedModel # NOQA from openfl.federated.task import PyTorchTaskRunner +if importlib.util.find_spec("xgboost") is not None: + from openfl.federated.data import FederatedDataSet # NOQA + from openfl.federated.data import XGBoostDataLoader + from openfl.federated.task import FederatedModel # NOQA + from openfl.federated.task import XGBoostTaskRunner __all__ = [ "Plan", diff --git a/openfl/federated/data/__init__.py b/openfl/federated/data/__init__.py index b61d6d24a3..e09ff26cd5 100644 --- a/openfl/federated/data/__init__.py +++ b/openfl/federated/data/__init__.py @@ -23,3 +23,8 @@ if importlib.util.find_spec("torch") is not None: from openfl.federated.data.federated_data import FederatedDataSet # NOQA from openfl.federated.data.loader_pt import PyTorchDataLoader # NOQA + +if importlib.util.find_spec("xgboost") is not None: + from openfl.federated.data.federated_data import FederatedDataSet # NOQA + from openfl.federated.data.loader_xgb import XGBoostDataLoader # NOQA + diff --git a/openfl/federated/data/loader_xgb.py b/openfl/federated/data/loader_xgb.py index f43244f310..99a1a33ec5 100644 --- a/openfl/federated/data/loader_xgb.py +++ b/openfl/federated/data/loader_xgb.py @@ -14,7 +14,7 @@ class XGBoostDataLoader: random_seed (int, optional): Random seed for data shuffling. """ - def __init__(self, batch_size, random_seed=None, **kwargs): + def __init__(self, batch_size=None, random_seed=None, **kwargs): """Initializes the XGBoostDataLoader object with the batch size, random seed, and any additional arguments. @@ -155,7 +155,10 @@ def get_train_dmatrix(self): Returns: xgb.DMatrix: The DMatrix object for the training data. """ - return self.get_dmatrix(self.X_train, self.y_train) + return { + 'dmatrix': self.get_dmatrix(self.X_train, self.y_train), + 'labels': self.y_train + } def get_valid_dmatrix(self): """Returns the DMatrix for the validation data. @@ -163,4 +166,7 @@ def get_valid_dmatrix(self): Returns: xgb.DMatrix: The DMatrix object for the validation data. 
""" - return self.get_dmatrix(self.X_valid, self.y_valid) \ No newline at end of file + return { + 'dmatrix': self.get_dmatrix(self.X_valid, self.y_valid), + 'labels': self.y_valid + } diff --git a/openfl/federated/task/__init__.py b/openfl/federated/task/__init__.py index cc5bb9429b..33058d8220 100644 --- a/openfl/federated/task/__init__.py +++ b/openfl/federated/task/__init__.py @@ -22,3 +22,6 @@ if importlib.util.find_spec("torch") is not None: from openfl.federated.task.fl_model import FederatedModel # NOQA from openfl.federated.task.runner_pt import PyTorchTaskRunner # NOQA +if importlib.util.find_spec("xgboost") is not None: + from openfl.federated.task.fl_model import FederatedModel # NOQA + from openfl.federated.task.runner_xgb import XGBoostTaskRunner # NOQA \ No newline at end of file diff --git a/openfl/federated/task/runner_xgb.py b/openfl/federated/task/runner_xgb.py index b2d48322a0..c678c605e5 100644 --- a/openfl/federated/task/runner_xgb.py +++ b/openfl/federated/task/runner_xgb.py @@ -4,8 +4,8 @@ """XGBoostTaskRunner module.""" -from copy import deepcopy -from typing import Iterator, Tuple +# from copy import deepcopy +# from typing import Iterator, Tuple import numpy as np import json @@ -15,12 +15,10 @@ from openfl.utilities.split import split_tensor_dict_for_holdouts import xgboost as xgb -from openfl.utilities import LocalTensor import json -from sklearn.datasets import fetch_california_housing -from sklearn.model_selection import train_test_split -from sklearn.metrics import r2_score +from sklearn.metrics import accuracy_score +import base64 class XGBoostTaskRunner(TaskRunner): def __init__(self, **kwargs): @@ -29,22 +27,21 @@ def __init__(self, **kwargs): Args: **kwargs: Additional parameters to pass to the functions. """ - super().__init__() - TaskRunner.__init__(self, **kwargs) - + super().__init__(**kwargs) # This is a map of all the required tensors for each of the public # functions in XGBoostTaskRunner - self.bst = None # TODO + # self.bst = None # TODO self.global_model = None # TODO - self.params = kwargs['params'] # TODO - self.num_rounds = kwargs['num_rounds'] # TODO + # self.params = kwargs['params'] # TODO + # self.num_rounds = kwargs['num_rounds'] # TODO self.required_tensorkeys_for_function = {} self.training_round_completed = False def rebuild_model(self, input_tensor_dict): - if input_tensor_dict is not None: - self.global_model = bytearray(input_tensor_dict) + if input_tensor_dict['local_tree'].size != 0: # check if it is empty (i.e. no model to build) + import pdb; pdb.set_trace() # need to check to make sure model is convertible + self.global_model = bytearray(input_tensor_dict['local_tree'][:-2]) #TODO self.bst = xgb.Booster() self.bst.load_model(self.global_model) @@ -67,7 +64,7 @@ def validate_task(self, col_name, round_num, input_tensor_dict, **kwargs): """ # during agg validation, self.bst will still be None. during local validation, it will have a value - no need to rebuild # if self.bst is still None after rebuilding, then there was no initial global model, so set metric to average - loader = self.data_loader.get_valid_loader() + loader = self.data_loader.get_valid_dmatrix() # if round_num != 0: # self.global_model = bytearray(input_tensor_dict) @@ -99,7 +96,6 @@ def train_task( col_name, round_num, input_tensor_dict, - epochs=1, **kwargs, ): """Train batches task. 
@@ -124,7 +120,7 @@ def train_task( # if round_num != 0: # self.global_model = bytearray(input_tensor_dict) self.rebuild_model(input_tensor_dict) - loader = self.data_loader.get_train_loader() + loader = self.data_loader.get_train_dmatrix() metric = self.train_(loader) # Output metric tensors (scalar) origin = col_name @@ -134,7 +130,7 @@ def train_task( } # output model tensors (Doesn't include TensorKey) - output_model_dict = self.get_tensor_dict(with_opt_vars=True) + output_model_dict = self.get_tensor_dict() global_model_dict, local_model_dict = split_tensor_dict_for_holdouts( self.logger, output_model_dict, **self.tensor_dict_split_fn_kwargs ) @@ -143,7 +139,7 @@ def train_task( global_tensorkey_model_dict = { TensorKey(tensor_name, origin, round_num, False, tags): nparray for tensor_name, nparray in global_model_dict.items() - } + } # Create tensorkeys that should stay local local_tensorkey_model_dict = { TensorKey(tensor_name, origin, round_num, False, tags): nparray @@ -177,16 +173,18 @@ def train_task( # involve doing a single epoch of training on random data to get the # optimizer names, and then throwing away the model. if self.opt_treatment == "CONTINUE_GLOBAL": - self.initialize_tensorkeys_for_functions(with_opt_vars=True) + self.initialize_tensorkeys_for_functions() # This will signal that the optimizer values are now present, # and can be loaded when the model is rebuilt self.training_round_completed = True # Return global_tensor_dict, local_tensor_dict + # import pdb; pdb.set_trace() + #TODO it is still decodable from here with .tobytes().decode('utf-8') return global_tensor_dict, local_tensor_dict - def get_tensor_dict(self): + def get_tensor_dict(self, with_opt_vars=False): if self.global_model is None: global_model_booster_dict = None num_global_trees = 0 @@ -194,18 +192,49 @@ def get_tensor_dict(self): global_model_booster_dict = json.loads(bytearray(self.global_model)) num_global_trees = int(global_model_booster_dict["learner"]["gradient_booster"]["model"]["gbtree_model_param"]["num_trees"]) - booster_array = self.bst.save_raw('json').decode('utf-8') - booster_dict = json.loads(booster_array) - num_total_trees = int(booster_dict["learner"]["gradient_booster"]["model"]["gbtree_model_param"]["num_trees"]) - - # Calculate the number of trees added in the latest training - num_latest_trees = num_total_trees - num_global_trees - + if self.bst is None: + combined_array = np.array([], dtype=np.float32) + # return { + # 'local_tree': np.array([0, 0], dtype=np.float32), + # } + # return { + # 'local_tree': np.array([], dtype=np.float32), + # 'num_global_trees': np.array(0, dtype=np.float32), + # 'num_latest_trees': np.array(0, dtype=np.float32), + # } + + else: + booster_array = self.bst.save_raw('json').decode('utf-8') + booster_dict = json.loads(booster_array) + num_total_trees = int(booster_dict["learner"]["gradient_booster"]["model"]["gbtree_model_param"]["num_trees"]) + + # Calculate the number of trees added in the latest training + num_latest_trees = num_total_trees - num_global_trees + + # Convert booster_array to np.array + # booster_np_array = np.frombuffer(booster_array.encode('utf-8'), dtype=np.uint8) + + # TODO, seems inefficient + booster_bytes = booster_array.encode('utf-8') + booster_base64 = base64.b64encode(booster_bytes).decode('utf-8') + + # Convert base64 string to np.float32 array + booster_float32_array = np.frombuffer(booster_base64.encode('utf-8'), dtype=np.uint8).view(np.float32) + + # Create a combined array with booster_float32_array, 
num_global_trees, and num_latest_trees + combined_array = np.concatenate(( + booster_float32_array, + np.array([num_global_trees, num_latest_trees], dtype=np.float32) + )) return { - 'local_tree': booster_array, - 'num_global_trees': int(num_global_trees), - 'num_latest_trees': int(num_latest_trees) + 'local_tree': combined_array } + + # return { + # 'local_tree': booster_float32_array, #booster_np_array, + # 'num_global_trees': np.array(num_global_trees, dtype=np.float32), + # 'num_latest_trees': np.array(num_latest_trees, dtype=np.float32) + # } def get_required_tensorkeys_for_function(self, func_name, **kwargs): """Get the required tensors for specified function that could be called @@ -321,21 +350,23 @@ def initialize_tensorkeys_for_functions(self, with_opt_vars=False): def train_(self, train_dataloader) -> Metric: """Train model.""" - dtrain = train_dataloader + dtrain = train_dataloader['dmatrix'] evals = [(dtrain, 'train')] evals_result = {} self.bst = xgb.train(self.params, dtrain, self.num_rounds, xgb_model=self.bst, evals=evals, evals_result=evals_result, verbose_eval=False) - loss = evals_result['train']['rmse'][-1] + loss = evals_result['train']['logloss'][-1] return Metric(name=self.loss_fn.__name__, value=np.array(loss)) def validate_(self, validation_dataloader) -> Metric: """Validate model.""" - dtest, y_test = validation_dataloader + dtest = validation_dataloader['dmatrix'] + y_test = validation_dataloader['labels'] preds = self.bst.predict(dtest) - r2 = r2_score(y_test, preds) + y_pred_binary = np.where(preds > 0.5, 1, 0) + acc = accuracy_score(y_test, y_pred_binary) - return Metric(name="accuracy", value=np.array(r2)) + return Metric(name="accuracy", value=np.array(acc)) diff --git a/openfl/interface/aggregation_functions/__init__.py b/openfl/interface/aggregation_functions/__init__.py index 39132eb9f6..1ddb7d0f25 100644 --- a/openfl/interface/aggregation_functions/__init__.py +++ b/openfl/interface/aggregation_functions/__init__.py @@ -12,3 +12,4 @@ from openfl.interface.aggregation_functions.median import Median from openfl.interface.aggregation_functions.weighted_average import WeightedAverage from openfl.interface.aggregation_functions.yogi_adaptive_aggregation import YogiAdaptiveAggregation +from openfl.interface.aggregation_functions.fed_bagging import FedBaggingXGBoost diff --git a/openfl/interface/aggregation_functions/fed_bagging.py b/openfl/interface/aggregation_functions/fed_bagging.py index 1f2cb1ca40..b0a60eff2a 100644 --- a/openfl/interface/aggregation_functions/fed_bagging.py +++ b/openfl/interface/aggregation_functions/fed_bagging.py @@ -6,7 +6,18 @@ import json from openfl.interface.aggregation_functions.core import AggregationFunction +import numpy as np +import base64 +def convert_back_to_json(booster_float32_array): + # Convert np.float32 array back to base64 string + booster_uint8_array = booster_float32_array.view(np.uint8) + booster_base64 = booster_uint8_array.tobytes().decode('utf-8') + + # Decode base64 string back to original JSON string + booster_bytes = base64.b64decode(booster_base64) + booster_array = booster_bytes.decode('utf-8') + return booster_array def verify_global_model(global_model, local_model, num_global_trees): for i in range(num_global_trees): @@ -56,29 +67,39 @@ def call(self, local_tensors, *_): global_model = None for local_tensor in local_tensors: + import pdb; pdb.set_trace() + local_tree_np_array = local_tensor.tensor[:-2] + # local_tree_np_array = local_tensor.tensor['local_tree'] + local_tree_json_string = 
convert_back_to_json(local_tree_np_array) + if global_model is None: - global_model = json.loads(local_tensor.tensor['local_tree']) + # the first tree becomes the global model to append to + global_model = json.loads(local_tree_json_string) else: - local_model = json.loads(local_tensor.tensor['local_tree']) - - # Assertion to check if the original trees in the local model match the global model trees - num_global_trees = local_tensor.tensor['num_global_trees'] - verify_global_model(global_model, local_model, num_global_trees) - - num_global_trees = int(global_model["learner"]["gradient_booster"]["model"]["gbtree_model_param"]["num_trees"]) - num_latest_trees = local_tensor.tensor['num_latest_trees'] - local_trees = local_model['learner']['gradient_booster']['model']['trees'][-num_latest_trees:] + # append subsequent trees + local_model = json.loads(local_tree_json_string) + + # Assertion to check if the original trees in the local model match the global model trees + num_global_trees = local_tensor.tensor[-2] + # num_global_trees = local_tensor.tensor['num_global_trees'] + verify_global_model(global_model, local_model, num_global_trees) + + num_global_trees = int(global_model["learner"]["gradient_booster"]["model"]["gbtree_model_param"]["num_trees"]) + num_latest_trees = local_tensor.tensor[-1] + # num_latest_trees = local_tensor.tensor['num_latest_trees'] + local_trees = local_model['learner']['gradient_booster']['model']['trees'][-num_latest_trees:] - global_model["learner"]["gradient_booster"]["model"]["gbtree_model_param"]["num_trees"] = str( - num_global_trees + num_latest_trees - ) - global_model["learner"]["gradient_booster"]["model"]["iteration_indptr"].append( - num_global_trees + num_latest_trees - ) + global_model["learner"]["gradient_booster"]["model"]["gbtree_model_param"]["num_trees"] = str( + num_global_trees + num_latest_trees + ) + global_model["learner"]["gradient_booster"]["model"]["iteration_indptr"].append( + num_global_trees + num_latest_trees + ) - for new_tree in range(num_latest_trees): - local_trees[new_tree]["id"] = num_global_trees + new_tree - global_model["learner"]["gradient_booster"]["model"]["trees"].append(local_trees[new_tree]) - global_model["learner"]["gradient_booster"]["model"]["tree_info"].append(0) + for new_tree in range(num_latest_trees): + local_trees[new_tree]["id"] = num_global_trees + new_tree + global_model["learner"]["gradient_booster"]["model"]["trees"].append(local_trees[new_tree]) + global_model["learner"]["gradient_booster"]["model"]["tree_info"].append(0) - return bytearray(json.dumps(global_model), "utf-8") + # TODO: this will probably be problematic, make sure that the conversion is working + return bytearray(json.dumps(global_model, default=int), "utf-8") diff --git a/openfl/interface/plan.py b/openfl/interface/plan.py index f4c91faed0..d235ad3070 100644 --- a/openfl/interface/plan.py +++ b/openfl/interface/plan.py @@ -172,7 +172,6 @@ def initialize( ) data_loader = get_dataloader(plan, prefer_minimal=True, input_shape=input_shape) - task_runner = plan.get_task_runner(data_loader) tensor_pipe = plan.get_tensor_pipe() From ddece36a7e2e177a5dbb84acbc65faf9c6c8e80b Mon Sep 17 00:00:00 2001 From: kta-intel Date: Thu, 14 Nov 2024 14:00:46 -0800 Subject: [PATCH 06/40] further enabling work Signed-off-by: kta-intel --- .../workspace/plan/defaults/aggregator.yaml | 5 +- openfl-workspace/xgb/plan/plan.yaml | 5 +- openfl/component/aggregator/aggregator.py | 8 +- openfl/federated/task/runner_xgb.py | 139 ++++++++++------ 
.../aggregation_functions/fed_bagging.py | 155 ++++++++++++------ 5 files changed, 208 insertions(+), 104 deletions(-) diff --git a/openfl-workspace/workspace/plan/defaults/aggregator.yaml b/openfl-workspace/workspace/plan/defaults/aggregator.yaml index 8b32cc986d..aac5a27f39 100644 --- a/openfl-workspace/workspace/plan/defaults/aggregator.yaml +++ b/openfl-workspace/workspace/plan/defaults/aggregator.yaml @@ -1,4 +1,5 @@ template : openfl.component.Aggregator settings : - db_store_rounds : 2 - write_logs : true + db_store_rounds : 2 + write_logs : true + delta_updates : true diff --git a/openfl-workspace/xgb/plan/plan.yaml b/openfl-workspace/xgb/plan/plan.yaml index 53de425499..21bb44eb4c 100644 --- a/openfl-workspace/xgb/plan/plan.yaml +++ b/openfl-workspace/xgb/plan/plan.yaml @@ -8,8 +8,9 @@ aggregator : init_state_path : save/init.pbuf best_state_path : save/best.pbuf last_state_path : save/last.pbuf - rounds_to_train : 2 - write_logs : false + rounds_to_train : 10 + write_logs : false + delta_updates : false collaborator : defaults : plan/defaults/collaborator.yaml diff --git a/openfl/component/aggregator/aggregator.py b/openfl/component/aggregator/aggregator.py index 0ec816276b..b3be9f7f63 100644 --- a/openfl/component/aggregator/aggregator.py +++ b/openfl/component/aggregator/aggregator.py @@ -68,6 +68,7 @@ def __init__( init_state_path, best_state_path, last_state_path, + delta_updates, assigner, straggler_handling_policy=None, rounds_to_train=256, @@ -186,6 +187,8 @@ def __init__( # Initialize a lock for thread safety self.lock = Lock() + self.delta_updates = delta_updates + def _load_initial_tensors(self): """Load all of the tensors required to begin federated learning. @@ -801,7 +804,7 @@ def _prepare_trained(self, tensor_name, origin, round_number, report, agg_result # Create delta and save it in TensorDB base_model_tk = TensorKey(tensor_name, origin, round_number, report, ("model",)) base_model_nparray = self.tensor_db.get_tensor_from_cache(base_model_tk) - if base_model_nparray is not None: + if base_model_nparray is not None and self.delta_updates: delta_tk, delta_nparray = self.tensor_codec.generate_delta( agg_tag_tk, agg_results, base_model_nparray ) @@ -830,7 +833,7 @@ def _prepare_trained(self, tensor_name, origin, round_number, report, agg_result self.tensor_db.cache_tensor({decompressed_delta_tk: decompressed_delta_nparray}) # Apply delta (unless delta couldn't be created) - if base_model_nparray is not None: + if base_model_nparray is not None and self.delta_updates: self.logger.debug("Applying delta for layer %s", decompressed_delta_tk[0]) new_model_tk, new_model_nparray = self.tensor_codec.apply_delta( decompressed_delta_tk, @@ -860,6 +863,7 @@ def _prepare_trained(self, tensor_name, origin, round_number, report, agg_result new_model_report, ("model",), ) + # import pdb; pdb.set_trace() # Finally, cache the updated model tensor self.tensor_db.cache_tensor({final_model_tk: new_model_nparray}) diff --git a/openfl/federated/task/runner_xgb.py b/openfl/federated/task/runner_xgb.py index c678c605e5..dd6035fbbe 100644 --- a/openfl/federated/task/runner_xgb.py +++ b/openfl/federated/task/runner_xgb.py @@ -20,6 +20,16 @@ import base64 +def convert_back_to_json(booster_float32_array): + # Convert np.float32 array back to base64 string + booster_uint8_array = booster_float32_array.view(np.uint8) + booster_base64 = booster_uint8_array.tobytes().decode('utf-8') + + # Decode base64 string back to original JSON string + booster_bytes = base64.b64decode(booster_base64) + 
booster_array = booster_bytes.decode('utf-8') + return booster_array + class XGBoostTaskRunner(TaskRunner): def __init__(self, **kwargs): """Initializes the XGBoostTaskRunner object. @@ -39,11 +49,14 @@ def __init__(self, **kwargs): self.training_round_completed = False def rebuild_model(self, input_tensor_dict): - if input_tensor_dict['local_tree'].size != 0: # check if it is empty (i.e. no model to build) - import pdb; pdb.set_trace() # need to check to make sure model is convertible - self.global_model = bytearray(input_tensor_dict['local_tree'][:-2]) #TODO + if (isinstance(input_tensor_dict['local_tree'], np.ndarray) and input_tensor_dict['local_tree'].size != 0) \ + or (not isinstance(input_tensor_dict['local_tree'], np.ndarray) and input_tensor_dict['local_tree'] is not None): + # if input_tensor_dict['local_tree'].size != 0: # check if it is empty (i.e. no model to build) + self.global_model = input_tensor_dict['local_tree'].view(np.uint8).tobytes().decode('utf-8') + self.global_model = base64.b64decode(self.global_model) + # self.global_model = bytearray(input_tensor_dict['local_tree']) #TODO self.bst = xgb.Booster() - self.bst.load_model(self.global_model) + self.bst.load_model(bytearray(self.global_model)) def validate_task(self, col_name, round_num, input_tensor_dict, **kwargs): """Validate Task. @@ -182,59 +195,93 @@ def train_task( # Return global_tensor_dict, local_tensor_dict # import pdb; pdb.set_trace() #TODO it is still decodable from here with .tobytes().decode('utf-8') - return global_tensor_dict, local_tensor_dict - + return global_tensor_dict, local_tensor_dict + def get_tensor_dict(self, with_opt_vars=False): - if self.global_model is None: - global_model_booster_dict = None - num_global_trees = 0 - else: - global_model_booster_dict = json.loads(bytearray(self.global_model)) - num_global_trees = int(global_model_booster_dict["learner"]["gradient_booster"]["model"]["gbtree_model_param"]["num_trees"]) + if self.bst is None: + # For initializing tensor dict + return {'local_tree': np.array([], dtype=np.float32)} - if self.bst is None: - combined_array = np.array([], dtype=np.float32) - # return { - # 'local_tree': np.array([0, 0], dtype=np.float32), - # } - # return { - # 'local_tree': np.array([], dtype=np.float32), - # 'num_global_trees': np.array(0, dtype=np.float32), - # 'num_latest_trees': np.array(0, dtype=np.float32), - # } - - else: booster_array = self.bst.save_raw('json').decode('utf-8') booster_dict = json.loads(booster_array) + + if (isinstance(self.global_model, np.ndarray) and self.global_model.size == 0) or self.global_model is None: + booster_bytes = booster_array.encode('utf-8') + booster_base64 = base64.b64encode(booster_bytes).decode('utf-8') + + # Convert base64 string to np.float32 array + booster_float32_array = np.frombuffer(booster_base64.encode('utf-8'), dtype=np.uint8).view(np.float32) + + return {'local_tree': booster_float32_array} + global_model_booster_dict = json.loads(self.global_model) + num_global_trees = int(global_model_booster_dict["learner"]["gradient_booster"]["model"]["gbtree_model_param"]["num_trees"]) num_total_trees = int(booster_dict["learner"]["gradient_booster"]["model"]["gbtree_model_param"]["num_trees"]) # Calculate the number of trees added in the latest training num_latest_trees = num_total_trees - num_global_trees + latest_trees = booster_dict['learner']['gradient_booster']['model']['trees'][-num_latest_trees:] + # Convert latest_trees to a JSON string + latest_trees_json = json.dumps(latest_trees) + + # Convert JSON 
string to np.float32 array + latest_trees_bytes = latest_trees_json.encode('utf-8') + latest_trees_base64 = base64.b64encode(latest_trees_bytes).decode('utf-8') + latest_trees_float32_array = np.frombuffer(latest_trees_base64.encode('utf-8'), dtype=np.uint8).view(np.float32) + + return {'local_tree': latest_trees_float32_array} + + + # def get_tensor_dict(self, with_opt_vars=False): + # if self.global_model is None: + # global_model_booster_dict = None + # num_global_trees = 0 + # else: + # global_model_booster_dict = json.loads(bytearray(self.global_model)) + # num_global_trees = int(global_model_booster_dict["learner"]["gradient_booster"]["model"]["gbtree_model_param"]["num_trees"]) + + # if self.bst is None: + # combined_array = np.array([], dtype=np.float32) + # # return { + # # 'local_tree': np.array([0, 0], dtype=np.float32), + # # } + # # return { + # # 'local_tree': np.array([], dtype=np.float32), + # # 'num_global_trees': np.array(0, dtype=np.float32), + # # 'num_latest_trees': np.array(0, dtype=np.float32), + # # } + + # else: + # booster_array = self.bst.save_raw('json').decode('utf-8') + # booster_dict = json.loads(booster_array) + # num_total_trees = int(booster_dict["learner"]["gradient_booster"]["model"]["gbtree_model_param"]["num_trees"]) + + # # Calculate the number of trees added in the latest training + # num_latest_trees = num_total_trees - num_global_trees - # Convert booster_array to np.array - # booster_np_array = np.frombuffer(booster_array.encode('utf-8'), dtype=np.uint8) + # # Convert booster_array to np.array + # # booster_np_array = np.frombuffer(booster_array.encode('utf-8'), dtype=np.uint8) - # TODO, seems inefficient - booster_bytes = booster_array.encode('utf-8') - booster_base64 = base64.b64encode(booster_bytes).decode('utf-8') - - # Convert base64 string to np.float32 array - booster_float32_array = np.frombuffer(booster_base64.encode('utf-8'), dtype=np.uint8).view(np.float32) - - # Create a combined array with booster_float32_array, num_global_trees, and num_latest_trees - combined_array = np.concatenate(( - booster_float32_array, - np.array([num_global_trees, num_latest_trees], dtype=np.float32) - )) - return { - 'local_tree': combined_array - } + # # TODO, seems inefficient + # booster_bytes = booster_array.encode('utf-8') + # booster_base64 = base64.b64encode(booster_bytes).decode('utf-8') + + # # Convert base64 string to np.float32 array + # booster_float32_array = np.frombuffer(booster_base64.encode('utf-8'), dtype=np.uint8).view(np.float32) + + # # Create a combined array with booster_float32_array, num_global_trees, and num_latest_trees + # combined_array = np.concatenate(( + # booster_float32_array, + # np.array([num_global_trees, num_latest_trees], dtype=np.float32) + # )) + # return { + # 'local_tree': combined_array + # } - # return { - # 'local_tree': booster_float32_array, #booster_np_array, - # 'num_global_trees': np.array(num_global_trees, dtype=np.float32), - # 'num_latest_trees': np.array(num_latest_trees, dtype=np.float32) - # } + # # return { + # # 'local_tree': booster_float32_array, #booster_np_array, + # # 'num_global_trees': np.array(num_global_trees, dtype=np.float32), + # # 'num_latest_trees': np.array(num_latest_trees, dtype=np.float32) + # # } def get_required_tensorkeys_for_function(self, func_name, **kwargs): """Get the required tensors for specified function that could be called diff --git a/openfl/interface/aggregation_functions/fed_bagging.py b/openfl/interface/aggregation_functions/fed_bagging.py index 
b0a60eff2a..1fe10f9a5b 100644 --- a/openfl/interface/aggregation_functions/fed_bagging.py +++ b/openfl/interface/aggregation_functions/fed_bagging.py @@ -6,32 +6,50 @@ import json from openfl.interface.aggregation_functions.core import AggregationFunction +from openfl.federated.task.runner_xgb import convert_back_to_json import numpy as np import base64 -def convert_back_to_json(booster_float32_array): - # Convert np.float32 array back to base64 string - booster_uint8_array = booster_float32_array.view(np.uint8) - booster_base64 = booster_uint8_array.tobytes().decode('utf-8') +def get_global_model(iterator, target_round): + for item in iterator: + # Items tagged with ('model',) are the global model of that round + if 'tags' in item and item['tags'] == ('model',) and item['round'] == target_round: + return item['nparray'] + raise ValueError(f"No item found with tag 'model' and round {target_round}") - # Decode base64 string back to original JSON string - booster_bytes = base64.b64decode(booster_base64) - booster_array = booster_bytes.decode('utf-8') - return booster_array +# def convert_back_to_json(booster_float32_array): +# # Convert np.float32 array back to base64 string +# booster_uint8_array = booster_float32_array.view(np.uint8) +# booster_base64 = booster_uint8_array.tobytes().decode('utf-8') -def verify_global_model(global_model, local_model, num_global_trees): - for i in range(num_global_trees): - global_tree = global_model['learner']['gradient_booster']['model']['trees'][i] - global_tree_local = local_model['learner']['gradient_booster']['model']['trees'][i] +# # Decode base64 string back to original JSON string +# booster_bytes = base64.b64decode(booster_base64) +# booster_array = booster_bytes.decode('utf-8') +# return booster_array - assert global_tree == global_tree_local, \ "Mismatch found in trees. Models are not from the same global model." +def append_trees(global_model, local_trees): + + num_global_trees = int(global_model["learner"]["gradient_booster"]["model"]["gbtree_model_param"]["num_trees"]) + num_local_trees = len(local_trees) + + global_model["learner"]["gradient_booster"]["model"]["gbtree_model_param"]["num_trees"] = str( + num_global_trees + num_local_trees + ) + global_model["learner"]["gradient_booster"]["model"]["iteration_indptr"].append( + num_global_trees + num_local_trees + ) + for new_tree in range(num_local_trees): + local_trees[new_tree]["id"] = num_global_trees + new_tree + global_model["learner"]["gradient_booster"]["model"]["trees"].append(local_trees[new_tree]) + global_model["learner"]["gradient_booster"]["model"]["tree_info"].append(0) + + return global_model class FedBaggingXGBoost(AggregationFunction): """Federated Bootstrap Aggregation for XGBoost.""" - def call(self, local_tensors, *_): + def call(self, local_tensors, db_iterator, tensor_name, fl_round, *_): """Aggregate tensors.
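Each collaborator ships only the trees grown in its latest round, and
aggregation appends those trees to the shared global booster rather than
averaging parameters. An illustrative sketch of the bookkeeping that
append_trees performs, on a skeletal booster dict (field names follow the
XGBoost JSON schema used above; the values are made up):

    global_model = {"learner": {"gradient_booster": {"model": {
        "gbtree_model_param": {"num_trees": "2"},
        "iteration_indptr": [0, 1, 2],
        "trees": [{"id": 0}, {"id": 1}],
        "tree_info": [0, 0],
    }}}}
    merged = append_trees(global_model, [{"id": 0}])  # one new local tree
    model = merged["learner"]["gradient_booster"]["model"]
    assert model["gbtree_model_param"]["num_trees"] == "3"
    assert model["trees"][-1]["id"] == 2  # appended tree ids are renumbered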
Args: @@ -64,42 +82,75 @@ def call(self, local_tensors, *_): Returns: bytearray: aggregated tensor """ - global_model = None + # global_model = None + global_model = get_global_model(db_iterator, fl_round) - for local_tensor in local_tensors: - import pdb; pdb.set_trace() - local_tree_np_array = local_tensor.tensor[:-2] - # local_tree_np_array = local_tensor.tensor['local_tree'] - local_tree_json_string = convert_back_to_json(local_tree_np_array) - - if global_model is None: - # the first tree becomes the global model to append to - global_model = json.loads(local_tree_json_string) - else: - # append subsequent trees - local_model = json.loads(local_tree_json_string) - - # Assertion to check if the original trees in the local model match the global model trees - num_global_trees = local_tensor.tensor[-2] - # num_global_trees = local_tensor.tensor['num_global_trees'] - verify_global_model(global_model, local_model, num_global_trees) + if (isinstance(global_model, np.ndarray) and global_model.size == 0) or global_model is None: + for local_tensor in local_tensors: + local_tree_json = json.loads(convert_back_to_json(local_tensor.tensor)) + + if (isinstance(global_model, np.ndarray) and global_model.size == 0) or global_model is None: + # the first tree becomes the global model to append to + global_model = local_tree_json + else: + # append subsequent trees + local_model = local_tree_json + local_trees = local_model['learner']['gradient_booster']['model']['trees'] + global_model = append_trees(global_model, local_trees) + else: + global_model = json.loads(convert_back_to_json(global_model)) + + for local_tensor in local_tensors: + local_trees = json.loads(convert_back_to_json(local_tensor.tensor)) + global_model = append_trees(global_model, local_trees) + + ## Ensures that model is recoverable. 
TODO put in function + # Convert latest_trees to a JSON string + global_model_json = json.dumps(global_model) + + # Convert JSON string to np.float32 array + global_model_bytes = global_model_json.encode('utf-8') + global_model_base64 = base64.b64encode(global_model_bytes).decode('utf-8') + global_model_float32_array = np.frombuffer(global_model_base64.encode('utf-8'), dtype=np.uint8).view(np.float32) + + return global_model_float32_array + + # # global_model = None + # import pdb; pdb.set_trace() + # global_model = get_global_model(db_iterator, fl_round) + + # for local_tensor in local_tensors: + # local_tree_np_array = local_tensor.tensor[:-2] + # # local_tree_np_array = local_tensor.tensor['local_tree'] + # local_tree_json = convert_back_to_json(local_tree_np_array) - num_global_trees = int(global_model["learner"]["gradient_booster"]["model"]["gbtree_model_param"]["num_trees"]) - num_latest_trees = local_tensor.tensor[-1] - # num_latest_trees = local_tensor.tensor['num_latest_trees'] - local_trees = local_model['learner']['gradient_booster']['model']['trees'][-num_latest_trees:] - - global_model["learner"]["gradient_booster"]["model"]["gbtree_model_param"]["num_trees"] = str( - num_global_trees + num_latest_trees - ) - global_model["learner"]["gradient_booster"]["model"]["iteration_indptr"].append( - num_global_trees + num_latest_trees - ) - - for new_tree in range(num_latest_trees): - local_trees[new_tree]["id"] = num_global_trees + new_tree - global_model["learner"]["gradient_booster"]["model"]["trees"].append(local_trees[new_tree]) - global_model["learner"]["gradient_booster"]["model"]["tree_info"].append(0) - - # TODO: this will probably be problematic, make sure that the conversion is working - return bytearray(json.dumps(global_model, default=int), "utf-8") + # if global_model.size == 0: + # # the first tree becomes the global model to append to + # global_model = local_tree_json + # else: + # # append subsequent trees + # local_model = local_tree_json + # # Assertion to check if the original trees in the local model match the global model trees + # num_global_trees = int(local_tensor.tensor[-2]) + # # num_global_trees = local_tensor.tensor['num_global_trees'] + # verify_global_model(global_model, local_model, num_global_trees) + + # num_global_trees = int(global_model["learner"]["gradient_booster"]["model"]["gbtree_model_param"]["num_trees"]) + # num_latest_trees = int(local_tensor.tensor[-1]) + # # num_latest_trees = local_tensor.tensor['num_latest_trees'] + # local_trees = local_model['learner']['gradient_booster']['model']['trees'][-num_latest_trees:] + + # global_model["learner"]["gradient_booster"]["model"]["gbtree_model_param"]["num_trees"] = str( + # num_global_trees + num_latest_trees + # ) + # global_model["learner"]["gradient_booster"]["model"]["iteration_indptr"].append( + # num_global_trees + num_latest_trees + # ) + + # for new_tree in range(num_latest_trees): + # local_trees[new_tree]["id"] = num_global_trees + new_tree + # global_model["learner"]["gradient_booster"]["model"]["trees"].append(local_trees[new_tree]) + # global_model["learner"]["gradient_booster"]["model"]["tree_info"].append(0) + + # # TODO: this will probably be problematic, make sure that the conversion is working + # return bytearray(json.dumps(global_model, default=int), "utf-8") From c7e2d76a16630df6a034d8f4333db7e00cd34f16 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Thu, 14 Nov 2024 14:18:09 -0800 Subject: [PATCH 07/40] fix first round local validation Signed-off-by: kta-intel --- 
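Note on the fix below: at round zero there is no initial global model, so
aggregated-model validation cannot predict with a booster that does not
exist and has to fall back to a placeholder metric. A standalone sketch of
the control flow (the helper name is illustrative; the attribute names and
the zero fallback mirror the diff):

    import numpy as np
    from openfl.utilities import Metric

    def validate_or_zero(runner, input_tensor_dict, loader):
        if runner.bst is None:
            # agg validation: try to rebuild from the aggregator's tensors
            runner.rebuild_model(input_tensor_dict)
        if runner.bst is None:
            # still no booster -> no initial global model exists yet
            return Metric(name="accuracy", value=np.array(0))
        return runner.validate_(loader)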
openfl/federated/task/runner_xgb.py | 64 +++-------------------------- 1 file changed, 6 insertions(+), 58 deletions(-) diff --git a/openfl/federated/task/runner_xgb.py b/openfl/federated/task/runner_xgb.py index dd6035fbbe..bd8a7afa16 100644 --- a/openfl/federated/task/runner_xgb.py +++ b/openfl/federated/task/runner_xgb.py @@ -75,17 +75,17 @@ def validate_task(self, col_name, round_num, input_tensor_dict, **kwargs): local_output_dict (dict): Tensors to maintain in the local TensorDB. """ - # during agg validation, self.bst will still be None. during local validation, it will have a value - no need to rebuild - # if self.bst is still None after rebuilding, then there was no initial global model, so set metric to average loader = self.data_loader.get_valid_dmatrix() - # if round_num != 0: - # self.global_model = bytearray(input_tensor_dict) + # during agg validation, self.bst will still be None. during local validation, it will have a value - no need to rebuild if self.bst is None: self.rebuild_model(input_tensor_dict) - if round_num == 0: # if self.bst is None: - metric = Metric(name="accuracy", value=np.array(0)) # for first round, there is no global model, so set metric to 0 + # if self.bst is still None after rebuilding, then there was no initial global model, so set metric to 0 + if self.bst is None: + # for first round agg validation, there is no model so set metric to 0 + # TODO: this is not robust, especially if using a loss metric + metric = Metric(name="accuracy", value=np.array(0)) else: metric = self.validate_(loader) @@ -229,59 +229,7 @@ def get_tensor_dict(self, with_opt_vars=False): latest_trees_float32_array = np.frombuffer(latest_trees_base64.encode('utf-8'), dtype=np.uint8).view(np.float32) return {'local_tree': latest_trees_float32_array} - - # def get_tensor_dict(self, with_opt_vars=False): - # if self.global_model is None: - # global_model_booster_dict = None - # num_global_trees = 0 - # else: - # global_model_booster_dict = json.loads(bytearray(self.global_model)) - # num_global_trees = int(global_model_booster_dict["learner"]["gradient_booster"]["model"]["gbtree_model_param"]["num_trees"]) - - # if self.bst is None: - # combined_array = np.array([], dtype=np.float32) - # # return { - # # 'local_tree': np.array([0, 0], dtype=np.float32), - # # } - # # return { - # # 'local_tree': np.array([], dtype=np.float32), - # # 'num_global_trees': np.array(0, dtype=np.float32), - # # 'num_latest_trees': np.array(0, dtype=np.float32), - # # } - - # else: - # booster_array = self.bst.save_raw('json').decode('utf-8') - # booster_dict = json.loads(booster_array) - # num_total_trees = int(booster_dict["learner"]["gradient_booster"]["model"]["gbtree_model_param"]["num_trees"]) - - # # Calculate the number of trees added in the latest training - # num_latest_trees = num_total_trees - num_global_trees - - # # Convert booster_array to np.array - # # booster_np_array = np.frombuffer(booster_array.encode('utf-8'), dtype=np.uint8) - - # # TODO, seems inefficient - # booster_bytes = booster_array.encode('utf-8') - # booster_base64 = base64.b64encode(booster_bytes).decode('utf-8') - - # # Convert base64 string to np.float32 array - # booster_float32_array = np.frombuffer(booster_base64.encode('utf-8'), dtype=np.uint8).view(np.float32) - - # # Create a combined array with booster_float32_array, num_global_trees, and num_latest_trees - # combined_array = np.concatenate(( - # booster_float32_array, - # np.array([num_global_trees, num_latest_trees], dtype=np.float32) - # )) - # return { - # 
'local_tree': combined_array - # } - - # # return { - # # 'local_tree': booster_float32_array, #booster_np_array, - # # 'num_global_trees': np.array(num_global_trees, dtype=np.float32), - # # 'num_latest_trees': np.array(num_latest_trees, dtype=np.float32) - # # } def get_required_tensorkeys_for_function(self, func_name, **kwargs): """Get the required tensors for specified function that could be called From 9d385a7782a51b689e6ce069693d2271e92c46a6 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Thu, 14 Nov 2024 15:22:47 -0800 Subject: [PATCH 08/40] remove need to convert to float64 Signed-off-by: kta-intel --- openfl-workspace/xgb/plan/plan.yaml | 2 +- openfl/federated/task/runner_xgb.py | 40 ++--------- .../aggregation_functions/fed_bagging.py | 66 ++----------------- 3 files changed, 15 insertions(+), 93 deletions(-) diff --git a/openfl-workspace/xgb/plan/plan.yaml b/openfl-workspace/xgb/plan/plan.yaml index 21bb44eb4c..337005a36c 100644 --- a/openfl-workspace/xgb/plan/plan.yaml +++ b/openfl-workspace/xgb/plan/plan.yaml @@ -8,7 +8,7 @@ aggregator : init_state_path : save/init.pbuf best_state_path : save/best.pbuf last_state_path : save/last.pbuf - rounds_to_train : 10 + rounds_to_train : 100 write_logs : false delta_updates : false diff --git a/openfl/federated/task/runner_xgb.py b/openfl/federated/task/runner_xgb.py index bd8a7afa16..cff8fcd662 100644 --- a/openfl/federated/task/runner_xgb.py +++ b/openfl/federated/task/runner_xgb.py @@ -18,17 +18,6 @@ import json from sklearn.metrics import accuracy_score -import base64 - -def convert_back_to_json(booster_float32_array): - # Convert np.float32 array back to base64 string - booster_uint8_array = booster_float32_array.view(np.uint8) - booster_base64 = booster_uint8_array.tobytes().decode('utf-8') - - # Decode base64 string back to original JSON string - booster_bytes = base64.b64decode(booster_base64) - booster_array = booster_bytes.decode('utf-8') - return booster_array class XGBoostTaskRunner(TaskRunner): def __init__(self, **kwargs): @@ -40,10 +29,7 @@ def __init__(self, **kwargs): super().__init__(**kwargs) # This is a map of all the required tensors for each of the public # functions in XGBoostTaskRunner - # self.bst = None # TODO self.global_model = None # TODO - # self.params = kwargs['params'] # TODO - # self.num_rounds = kwargs['num_rounds'] # TODO self.required_tensorkeys_for_function = {} self.training_round_completed = False @@ -51,12 +37,9 @@ def __init__(self, **kwargs): def rebuild_model(self, input_tensor_dict): if (isinstance(input_tensor_dict['local_tree'], np.ndarray) and input_tensor_dict['local_tree'].size != 0) \ or (not isinstance(input_tensor_dict['local_tree'], np.ndarray) and input_tensor_dict['local_tree'] is not None): - # if input_tensor_dict['local_tree'].size != 0: # check if it is empty (i.e. no model to build) - self.global_model = input_tensor_dict['local_tree'].view(np.uint8).tobytes().decode('utf-8') - self.global_model = base64.b64decode(self.global_model) - # self.global_model = bytearray(input_tensor_dict['local_tree']) #TODO + self.global_model = bytearray(input_tensor_dict['local_tree'].astype(np.uint8).tobytes()) self.bst = xgb.Booster() - self.bst.load_model(bytearray(self.global_model)) + self.bst.load_model(self.global_model) def validate_task(self, col_name, round_num, input_tensor_dict, **kwargs): """Validate Task. @@ -128,10 +111,6 @@ def train_task( local_output_dict (dict): Tensors to maintain in the local TensorDB. 
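        Example of the continued-training loop at the core of each round
        (illustrative and self-contained; the synthetic data and parameter
        values are assumptions, not workspace settings):

            import numpy as np
            import xgboost as xgb

            X = np.random.rand(64, 4)
            y = (X[:, 0] > 0.5).astype(int)
            dtrain = xgb.DMatrix(X, label=y)
            params = {"objective": "binary:logistic", "eval_metric": "logloss"}

            bst = None
            for _ in range(2):
                # xgb_model=bst resumes from the previous booster, so each
                # call appends trees instead of retraining from scratch
                bst = xgb.train(params, dtrain, num_boost_round=1, xgb_model=bst)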
""" - # self.rebuild_model(round_num, input_tensor_dict) - # set to "training" mode - # if round_num != 0: - # self.global_model = bytearray(input_tensor_dict) self.rebuild_model(input_tensor_dict) loader = self.data_loader.get_train_dmatrix() metric = self.train_(loader) @@ -202,17 +181,13 @@ def get_tensor_dict(self, with_opt_vars=False): # For initializing tensor dict return {'local_tree': np.array([], dtype=np.float32)} - booster_array = self.bst.save_raw('json').decode('utf-8') + booster_array = self.bst.save_raw('json') booster_dict = json.loads(booster_array) if (isinstance(self.global_model, np.ndarray) and self.global_model.size == 0) or self.global_model is None: - booster_bytes = booster_array.encode('utf-8') - booster_base64 = base64.b64encode(booster_bytes).decode('utf-8') - - # Convert base64 string to np.float32 array - booster_float32_array = np.frombuffer(booster_base64.encode('utf-8'), dtype=np.uint8).view(np.float32) - + booster_float32_array = np.frombuffer(booster_array, dtype=np.uint8).astype(np.float32) return {'local_tree': booster_float32_array} + global_model_booster_dict = json.loads(self.global_model) num_global_trees = int(global_model_booster_dict["learner"]["gradient_booster"]["model"]["gbtree_model_param"]["num_trees"]) num_total_trees = int(booster_dict["learner"]["gradient_booster"]["model"]["gbtree_model_param"]["num_trees"]) @@ -220,13 +195,12 @@ def get_tensor_dict(self, with_opt_vars=False): # Calculate the number of trees added in the latest training num_latest_trees = num_total_trees - num_global_trees latest_trees = booster_dict['learner']['gradient_booster']['model']['trees'][-num_latest_trees:] + # Convert latest_trees to a JSON string latest_trees_json = json.dumps(latest_trees) - # Convert JSON string to np.float32 array latest_trees_bytes = latest_trees_json.encode('utf-8') - latest_trees_base64 = base64.b64encode(latest_trees_bytes).decode('utf-8') - latest_trees_float32_array = np.frombuffer(latest_trees_base64.encode('utf-8'), dtype=np.uint8).view(np.float32) + latest_trees_float32_array = np.frombuffer(latest_trees_bytes, dtype=np.uint8).astype(np.float32) return {'local_tree': latest_trees_float32_array} diff --git a/openfl/interface/aggregation_functions/fed_bagging.py b/openfl/interface/aggregation_functions/fed_bagging.py index 1fe10f9a5b..336a14dd75 100644 --- a/openfl/interface/aggregation_functions/fed_bagging.py +++ b/openfl/interface/aggregation_functions/fed_bagging.py @@ -6,9 +6,7 @@ import json from openfl.interface.aggregation_functions.core import AggregationFunction -from openfl.federated.task.runner_xgb import convert_back_to_json import numpy as np -import base64 def get_global_model(iterator, target_round): for item in iterator: @@ -17,15 +15,6 @@ def get_global_model(iterator, target_round): return item['nparray'] raise ValueError(f"No item found with tag 'model' and round {target_round}") -# def convert_back_to_json(booster_float32_array): -# # Convert np.float32 array back to base64 string -# booster_uint8_array = booster_float32_array.view(np.uint8) -# booster_base64 = booster_uint8_array.tobytes().decode('utf-8') - -# # Decode base64 string back to original JSON string -# booster_bytes = base64.b64decode(booster_base64) -# booster_array = booster_bytes.decode('utf-8') -# return booster_array def append_trees(global_model, local_trees): @@ -87,7 +76,8 @@ def call(self, local_tensors, db_iterator, tensor_name, fl_round, *_): if (isinstance(global_model, np.ndarray) and global_model.size == 0) or global_model is None: 
for local_tensor in local_tensors: - local_tree_json = json.loads(convert_back_to_json(local_tensor.tensor)) + local_tree_bytearray = bytearray(local_tensor.tensor.astype(np.uint8).tobytes()) + local_tree_json = json.loads(local_tree_bytearray) if (isinstance(global_model, np.ndarray) and global_model.size == 0) or global_model is None: # the first tree becomes the global model to append to @@ -98,59 +88,17 @@ def call(self, local_tensors, db_iterator, tensor_name, fl_round, *_): local_trees = local_model['learner']['gradient_booster']['model']['trees'] global_model = append_trees(global_model, local_trees) else: - global_model = json.loads(convert_back_to_json(global_model)) + global_model_bytearray = bytearray(global_model.astype(np.uint8).tobytes()) + global_model = json.loads(global_model_bytearray) for local_tensor in local_tensors: - local_trees = json.loads(convert_back_to_json(local_tensor.tensor)) + local_tree_bytearray = bytearray(local_tensor.tensor.astype(np.uint8).tobytes()) + local_trees = json.loads(local_tree_bytearray) global_model = append_trees(global_model, local_trees) ## Ensures that model is recoverable. TODO put in function # Convert latest_trees to a JSON string global_model_json = json.dumps(global_model) - - # Convert JSON string to np.float32 array global_model_bytes = global_model_json.encode('utf-8') - global_model_base64 = base64.b64encode(global_model_bytes).decode('utf-8') - global_model_float32_array = np.frombuffer(global_model_base64.encode('utf-8'), dtype=np.uint8).view(np.float32) - - return global_model_float32_array - - # # global_model = None - # import pdb; pdb.set_trace() - # global_model = get_global_model(db_iterator, fl_round) - # for local_tensor in local_tensors: - # local_tree_np_array = local_tensor.tensor[:-2] - # # local_tree_np_array = local_tensor.tensor['local_tree'] - # local_tree_json = convert_back_to_json(local_tree_np_array) - - # if global_model.size == 0: - # # the first tree becomes the global model to append to - # global_model = local_tree_json - # else: - # # append subsequent trees - # local_model = local_tree_json - # # Assertion to check if the original trees in the local model match the global model trees - # num_global_trees = int(local_tensor.tensor[-2]) - # # num_global_trees = local_tensor.tensor['num_global_trees'] - # verify_global_model(global_model, local_model, num_global_trees) - - # num_global_trees = int(global_model["learner"]["gradient_booster"]["model"]["gbtree_model_param"]["num_trees"]) - # num_latest_trees = int(local_tensor.tensor[-1]) - # # num_latest_trees = local_tensor.tensor['num_latest_trees'] - # local_trees = local_model['learner']['gradient_booster']['model']['trees'][-num_latest_trees:] - - # global_model["learner"]["gradient_booster"]["model"]["gbtree_model_param"]["num_trees"] = str( - # num_global_trees + num_latest_trees - # ) - # global_model["learner"]["gradient_booster"]["model"]["iteration_indptr"].append( - # num_global_trees + num_latest_trees - # ) - - # for new_tree in range(num_latest_trees): - # local_trees[new_tree]["id"] = num_global_trees + new_tree - # global_model["learner"]["gradient_booster"]["model"]["trees"].append(local_trees[new_tree]) - # global_model["learner"]["gradient_booster"]["model"]["tree_info"].append(0) - - # # TODO: this will probably be problematic, make sure that the conversion is working - # return bytearray(json.dumps(global_model, default=int), "utf-8") + return np.frombuffer(global_model_bytes, dtype=np.uint8).astype(np.float32) \ No newline at end 
of file From ce4b34fe1e11c3a9add6db3ce09cbae8fdc15e10 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Fri, 15 Nov 2024 10:54:27 -0800 Subject: [PATCH 09/40] fix model save Signed-off-by: kta-intel --- openfl/federated/task/runner_xgb.py | 97 +++++++++++-------- .../aggregation_functions/fed_bagging.py | 29 ++++-- 2 files changed, 81 insertions(+), 45 deletions(-) diff --git a/openfl/federated/task/runner_xgb.py b/openfl/federated/task/runner_xgb.py index cff8fcd662..bcdbe1ec16 100644 --- a/openfl/federated/task/runner_xgb.py +++ b/openfl/federated/task/runner_xgb.py @@ -27,19 +27,28 @@ def __init__(self, **kwargs): **kwargs: Additional parameters to pass to the functions. """ super().__init__(**kwargs) - # This is a map of all the required tensors for each of the public - # functions in XGBoostTaskRunner - self.global_model = None # TODO - + self.global_model = None self.required_tensorkeys_for_function = {} self.training_round_completed = False def rebuild_model(self, input_tensor_dict): + """ + Rebuilds the model using the provided input tensor dictionary. + + This method checks if the 'local_tree' key in the input tensor dictionary is either a non-empty numpy array + or a non-None value. If this condition is met, it updates the internal tensor dictionary with the provided input. + + Parameters: + input_tensor_dict (dict): A dictionary containing tensor data. It must include the key 'local_tree', which can be: + - A non-empty numpy array + - Any non-None value + + Returns: + None + """ if (isinstance(input_tensor_dict['local_tree'], np.ndarray) and input_tensor_dict['local_tree'].size != 0) \ or (not isinstance(input_tensor_dict['local_tree'], np.ndarray) and input_tensor_dict['local_tree'] is not None): - self.global_model = bytearray(input_tensor_dict['local_tree'].astype(np.uint8).tobytes()) - self.bst = xgb.Booster() - self.bst.load_model(self.global_model) + self.set_tensor_dict(input_tensor_dict) def validate_task(self, col_name, round_num, input_tensor_dict, **kwargs): """Validate Task. @@ -50,7 +59,6 @@ def validate_task(self, col_name, round_num, input_tensor_dict, **kwargs): col_name (str): Name of the collaborator. round_num (int): What round is it. input_tensor_dict (dict): Required input tensors (for model). - use_tqdm (bool): Use tqdm to print a progress bar (Default=True). **kwargs: Additional parameters. Returns: @@ -102,8 +110,6 @@ def train_task( col_name (str): Name of the collaborator. round_num (int): What round is it. input_tensor_dict (dict): Required input tensors (for model). - use_tqdm (bool): Use tqdm to print a progress bar (Default=True). - epochs (int): The number of epochs to train. **kwargs: Additional parameters. Returns: @@ -177,6 +183,21 @@ def train_task( return global_tensor_dict, local_tensor_dict def get_tensor_dict(self, with_opt_vars=False): + """ + Retrieves the tensor dictionary containing the model's tree structure. + + This method returns a dictionary with the key 'local_tree', which contains the model's tree structure as a numpy array. + If the model has not been initialized (`self.bst` is None), it returns an empty numpy array. + If the global model is not set or is empty, it returns the entire model as a numpy array. + Otherwise, it returns only the trees added in the latest training session. + + Parameters: + with_opt_vars (bool): N/A for XGBoost (Default=False). + + Returns: + dict: A dictionary with the key 'local_tree' containing the model's tree structure as a numpy array. 
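        Example of the byte-level round trip behind this encoding
        (illustrative; a one-element tree list stands in for booster JSON):

            import json
            import numpy as np

            payload = json.dumps([{"id": 0}])
            arr = np.frombuffer(payload.encode("utf-8"), dtype=np.uint8).astype(np.float32)
            # each byte travels as one float32 through the tensor pipeline
            recovered = json.loads(bytearray(arr.astype(np.uint8).tobytes()))
            assert recovered == [{"id": 0}]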
+ """ + if self.bst is None: # For initializing tensor dict return {'local_tree': np.array([], dtype=np.float32)} @@ -196,9 +217,7 @@ def get_tensor_dict(self, with_opt_vars=False): num_latest_trees = num_total_trees - num_global_trees latest_trees = booster_dict['learner']['gradient_booster']['model']['trees'][-num_latest_trees:] - # Convert latest_trees to a JSON string latest_trees_json = json.dumps(latest_trees) - # Convert JSON string to np.float32 array latest_trees_bytes = latest_trees_json.encode('utf-8') latest_trees_float32_array = np.frombuffer(latest_trees_bytes, dtype=np.uint8).astype(np.float32) @@ -289,33 +308,33 @@ def initialize_tensorkeys_for_functions(self, with_opt_vars=False): for tensor_name in local_model_dict_val ] - # def save_native( - # self, - # filepath, - # model_state_dict_key="model_state_dict", - # optimizer_state_dict_key="optimizer_state_dict", - # **kwargs, - # ): - # """Save model and optimizer states in a picked file specified by the - # filepath. model_/optimizer_state_dicts are stored in the keys provided. - # Uses pt.save(). - - # Args: - # filepath (str): Path to pickle file to be created by pt.save(). - # model_state_dict_key (str): key for model state dict in pickled - # file. - # optimizer_state_dict_key (str): key for optimizer state dict in - # picked file. - # **kwargs: Additional parameters. - - # Returns: - # None - # """ - # pickle_dict = { - # model_state_dict_key: self.state_dict(), - # optimizer_state_dict_key: self.optimizer.state_dict(), - # } - # torch.save(pickle_dict, filepath) + def set_tensor_dict(self, tensor_dict, with_opt_vars=False): + """Set the tensor dictionary. + + Args: + tensor_dict (dict): The tensor dictionary. + with_opt_vars (bool): N/A for XGBoost (Default=False). + """ + # The with_opt_vars argument is not used in this method + self.global_model = bytearray(tensor_dict['local_tree'].astype(np.uint8).tobytes()) + self.bst = xgb.Booster() + self.bst.load_model(self.global_model) + + def save_native( + self, + filepath, + **kwargs, + ): + """Save XGB booster to file. + + Args: + filepath (str): Path to pickle file to be created by booster.save_model(). + **kwargs: Additional parameters. + + Returns: + None + """ + self.bst.save_model(filepath) def train_(self, train_dataloader) -> Metric: """Train model.""" diff --git a/openfl/interface/aggregation_functions/fed_bagging.py b/openfl/interface/aggregation_functions/fed_bagging.py index 336a14dd75..d67c977fbd 100644 --- a/openfl/interface/aggregation_functions/fed_bagging.py +++ b/openfl/interface/aggregation_functions/fed_bagging.py @@ -5,10 +5,20 @@ """Federated Boostrap Aggregation for XGBoost module.""" import json -from openfl.interface.aggregation_functions.core import AggregationFunction import numpy as np +from openfl.interface.aggregation_functions.core import AggregationFunction def get_global_model(iterator, target_round): + """ + Retrieves the global model for the specific round from an iterator. + + Parameters: + iterator (iterable): An iterable containing items with 'tags' and 'round' keys. + target_round (int): The round number for which the global model is to be retrieved. + + Returns: + np.ndarray: The numpy array representing the global model for the specified round. 
+ """ for item in iterator: # Items tagged with ('model',) are the global model of that round if 'tags' in item and item['tags'] == ('model',) and item['round'] == target_round: @@ -17,7 +27,16 @@ def get_global_model(iterator, target_round): def append_trees(global_model, local_trees): + """ + Appends local trees to the global model. + + Parameters: + global_model (dict): A dictionary representing the global model. + local_trees (list): A list of dictionaries representing the local trees to be appended to the global model. + Returns: + dict: The updated global model with the local trees appended. + """ num_global_trees = int(global_model["learner"]["gradient_booster"]["model"]["gbtree_model_param"]["num_trees"]) num_local_trees = len(local_trees) @@ -71,7 +90,7 @@ def call(self, local_tensors, db_iterator, tensor_name, fl_round, *_): Returns: bytearray: aggregated tensor """ - # global_model = None + global_model = get_global_model(db_iterator, fl_round) if (isinstance(global_model, np.ndarray) and global_model.size == 0) or global_model is None: @@ -80,10 +99,10 @@ def call(self, local_tensors, db_iterator, tensor_name, fl_round, *_): local_tree_json = json.loads(local_tree_bytearray) if (isinstance(global_model, np.ndarray) and global_model.size == 0) or global_model is None: - # the first tree becomes the global model to append to + # the first tree becomes the global model global_model = local_tree_json else: - # append subsequent trees + # append subsequent trees to global model local_model = local_tree_json local_trees = local_model['learner']['gradient_booster']['model']['trees'] global_model = append_trees(global_model, local_trees) @@ -96,8 +115,6 @@ def call(self, local_tensors, db_iterator, tensor_name, fl_round, *_): local_trees = json.loads(local_tree_bytearray) global_model = append_trees(global_model, local_trees) - ## Ensures that model is recoverable. 
TODO put in function - # Convert latest_trees to a JSON string global_model_json = json.dumps(global_model) global_model_bytes = global_model_json.encode('utf-8') From 70e41716bad37b4169baa49ee062d1b5efae0672 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Fri, 15 Nov 2024 13:16:30 -0800 Subject: [PATCH 10/40] remove set_trace and fix spacing Signed-off-by: kta-intel --- .../workspace/plan/defaults/tasks_xgb.yaml | 14 +++++++------- openfl/component/aggregator/aggregator.py | 1 - 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/openfl-workspace/workspace/plan/defaults/tasks_xgb.yaml b/openfl-workspace/workspace/plan/defaults/tasks_xgb.yaml index b61942d4f7..7b14010eaa 100644 --- a/openfl-workspace/workspace/plan/defaults/tasks_xgb.yaml +++ b/openfl-workspace/workspace/plan/defaults/tasks_xgb.yaml @@ -1,21 +1,21 @@ aggregated_model_validation: function : validate_task - kwargs : - apply : global + kwargs : + apply : global metrics : - acc locally_tuned_model_validation: - function : validate_task - kwargs : - apply: local + function : validate_task + kwargs : + apply : local metrics : - acc train: function : train_task - kwargs : - metrics : + kwargs : + metrics : - loss aggregation_type : template : openfl.interface.aggregation_functions.FedBaggingXGBoost \ No newline at end of file diff --git a/openfl/component/aggregator/aggregator.py b/openfl/component/aggregator/aggregator.py index b3be9f7f63..5e0d6a3dc0 100644 --- a/openfl/component/aggregator/aggregator.py +++ b/openfl/component/aggregator/aggregator.py @@ -863,7 +863,6 @@ def _prepare_trained(self, tensor_name, origin, round_number, report, agg_result new_model_report, ("model",), ) - # import pdb; pdb.set_trace() # Finally, cache the updated model tensor self.tensor_db.cache_tensor({final_model_tk: new_model_nparray}) From 3d2df78c2f6ff0457f639754157fbb102e065ee9 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Fri, 15 Nov 2024 13:21:44 -0800 Subject: [PATCH 11/40] rename workspace and fix plan Signed-off-by: kta-intel --- openfl-workspace/{xgb => xgb_higgs}/.workspace | 0 openfl-workspace/{xgb => xgb_higgs}/plan/cols.yaml | 0 openfl-workspace/{xgb => xgb_higgs}/plan/data.yaml | 0 openfl-workspace/{xgb => xgb_higgs}/plan/defaults | 0 openfl-workspace/{xgb => xgb_higgs}/plan/plan.yaml | 2 +- openfl-workspace/{xgb => xgb_higgs}/requirements.txt | 0 openfl-workspace/{xgb => xgb_higgs}/src/__init__.py | 0 openfl-workspace/{xgb => xgb_higgs}/src/dataloader.py | 0 openfl-workspace/{xgb => xgb_higgs}/src/setup_data.py | 0 openfl-workspace/{xgb => xgb_higgs}/src/taskrunner.py | 0 10 files changed, 1 insertion(+), 1 deletion(-) rename openfl-workspace/{xgb => xgb_higgs}/.workspace (100%) rename openfl-workspace/{xgb => xgb_higgs}/plan/cols.yaml (100%) rename openfl-workspace/{xgb => xgb_higgs}/plan/data.yaml (100%) rename openfl-workspace/{xgb => xgb_higgs}/plan/defaults (100%) rename openfl-workspace/{xgb => xgb_higgs}/plan/plan.yaml (98%) rename openfl-workspace/{xgb => xgb_higgs}/requirements.txt (100%) rename openfl-workspace/{xgb => xgb_higgs}/src/__init__.py (100%) rename openfl-workspace/{xgb => xgb_higgs}/src/dataloader.py (100%) rename openfl-workspace/{xgb => xgb_higgs}/src/setup_data.py (100%) rename openfl-workspace/{xgb => xgb_higgs}/src/taskrunner.py (100%) diff --git a/openfl-workspace/xgb/.workspace b/openfl-workspace/xgb_higgs/.workspace similarity index 100% rename from openfl-workspace/xgb/.workspace rename to openfl-workspace/xgb_higgs/.workspace diff --git a/openfl-workspace/xgb/plan/cols.yaml 
b/openfl-workspace/xgb_higgs/plan/cols.yaml similarity index 100% rename from openfl-workspace/xgb/plan/cols.yaml rename to openfl-workspace/xgb_higgs/plan/cols.yaml diff --git a/openfl-workspace/xgb/plan/data.yaml b/openfl-workspace/xgb_higgs/plan/data.yaml similarity index 100% rename from openfl-workspace/xgb/plan/data.yaml rename to openfl-workspace/xgb_higgs/plan/data.yaml diff --git a/openfl-workspace/xgb/plan/defaults b/openfl-workspace/xgb_higgs/plan/defaults similarity index 100% rename from openfl-workspace/xgb/plan/defaults rename to openfl-workspace/xgb_higgs/plan/defaults diff --git a/openfl-workspace/xgb/plan/plan.yaml b/openfl-workspace/xgb_higgs/plan/plan.yaml similarity index 98% rename from openfl-workspace/xgb/plan/plan.yaml rename to openfl-workspace/xgb_higgs/plan/plan.yaml index 337005a36c..21bb44eb4c 100644 --- a/openfl-workspace/xgb/plan/plan.yaml +++ b/openfl-workspace/xgb_higgs/plan/plan.yaml @@ -8,7 +8,7 @@ aggregator : init_state_path : save/init.pbuf best_state_path : save/best.pbuf last_state_path : save/last.pbuf - rounds_to_train : 100 + rounds_to_train : 10 write_logs : false delta_updates : false diff --git a/openfl-workspace/xgb/requirements.txt b/openfl-workspace/xgb_higgs/requirements.txt similarity index 100% rename from openfl-workspace/xgb/requirements.txt rename to openfl-workspace/xgb_higgs/requirements.txt diff --git a/openfl-workspace/xgb/src/__init__.py b/openfl-workspace/xgb_higgs/src/__init__.py similarity index 100% rename from openfl-workspace/xgb/src/__init__.py rename to openfl-workspace/xgb_higgs/src/__init__.py diff --git a/openfl-workspace/xgb/src/dataloader.py b/openfl-workspace/xgb_higgs/src/dataloader.py similarity index 100% rename from openfl-workspace/xgb/src/dataloader.py rename to openfl-workspace/xgb_higgs/src/dataloader.py diff --git a/openfl-workspace/xgb/src/setup_data.py b/openfl-workspace/xgb_higgs/src/setup_data.py similarity index 100% rename from openfl-workspace/xgb/src/setup_data.py rename to openfl-workspace/xgb_higgs/src/setup_data.py diff --git a/openfl-workspace/xgb/src/taskrunner.py b/openfl-workspace/xgb_higgs/src/taskrunner.py similarity index 100% rename from openfl-workspace/xgb/src/taskrunner.py rename to openfl-workspace/xgb_higgs/src/taskrunner.py From 54cdc5ee1f95341d23f5c47008d11c21481ecda0 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Fri, 15 Nov 2024 13:24:55 -0800 Subject: [PATCH 12/40] fix lint Signed-off-by: kta-intel --- openfl-workspace/xgb_higgs/requirements.txt | 2 +- openfl-workspace/xgb_higgs/src/dataloader.py | 2 +- openfl-workspace/xgb_higgs/src/setup_data.py | 16 ++++++++-------- openfl-workspace/xgb_higgs/src/taskrunner.py | 6 +++--- openfl/federated/data/__init__.py | 1 - openfl/federated/task/__init__.py | 2 +- openfl/federated/task/runner_xgb.py | 16 ++++++++-------- .../aggregation_functions/fed_bagging.py | 10 +++++----- 8 files changed, 27 insertions(+), 28 deletions(-) diff --git a/openfl-workspace/xgb_higgs/requirements.txt b/openfl-workspace/xgb_higgs/requirements.txt index aa6b070230..d65559c90c 100644 --- a/openfl-workspace/xgb_higgs/requirements.txt +++ b/openfl-workspace/xgb_higgs/requirements.txt @@ -1,2 +1,2 @@ +scikit-learn xgboost -scikit-learn \ No newline at end of file diff --git a/openfl-workspace/xgb_higgs/src/dataloader.py b/openfl-workspace/xgb_higgs/src/dataloader.py index 3d792a91c9..f03e2cb94f 100644 --- a/openfl-workspace/xgb_higgs/src/dataloader.py +++ b/openfl-workspace/xgb_higgs/src/dataloader.py @@ -27,4 +27,4 @@ def load_Higgs(data_path, **kwargs): X_valid 
= valid_data.iloc[:, 1:].values y_valid = valid_data.iloc[:, 0].values - return X_train, y_train, X_valid, y_valid \ No newline at end of file + return X_train, y_train, X_valid, y_valid diff --git a/openfl-workspace/xgb_higgs/src/setup_data.py b/openfl-workspace/xgb_higgs/src/setup_data.py index 8aaf197c7e..d9c540a559 100644 --- a/openfl-workspace/xgb_higgs/src/setup_data.py +++ b/openfl-workspace/xgb_higgs/src/setup_data.py @@ -63,32 +63,32 @@ def main(): setup_data(src) collaborators = int(sys.argv[1]) print("Creating splits for {} collaborators".format(collaborators)) - + # Load the dataset higgs_data = pd.read_csv(path.join(src, CSV_FILENAME), header=None, nrows=1000000) - + # Split the dataset into features and labels X = higgs_data.iloc[:, 1:].values y = higgs_data.iloc[:, 0].values - + # Split the dataset into training and testing sets X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) - + # Combine X and y for train and test sets train_data = pd.DataFrame(data=np.column_stack((y_train, X_train))) test_data = pd.DataFrame(data=np.column_stack((y_test, X_test))) - + # Split the training data into parts for each collaborator for i in range(collaborators): dst = f'data/{i+1}' makedirs(dst, exist_ok=True) - + # Split the training data for the current collaborator split_train_data = train_data.iloc[i::collaborators] split_train_data.to_csv(path.join(dst, 'train.csv'), index=False, header=False) - + # Copy the test data for the current collaborator test_data.to_csv(path.join(dst, 'valid.csv'), index=False, header=False) if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git a/openfl-workspace/xgb_higgs/src/taskrunner.py b/openfl-workspace/xgb_higgs/src/taskrunner.py index 1899b11da9..410c4f49c9 100644 --- a/openfl-workspace/xgb_higgs/src/taskrunner.py +++ b/openfl-workspace/xgb_higgs/src/taskrunner.py @@ -28,7 +28,7 @@ def __init__(self, params=None, num_rounds=1, **kwargs): """ super().__init__(**kwargs) - self.bst = None + self.bst = None self.params = params self.num_rounds = num_rounds @@ -37,8 +37,8 @@ def train_(self, train_dataloader) -> Metric: dtrain = train_dataloader['dmatrix'] evals = [(dtrain, 'train')] evals_result = {} - - self.bst = xgb.train(self.params, dtrain, self.num_rounds, xgb_model=self.bst, + + self.bst = xgb.train(self.params, dtrain, self.num_rounds, xgb_model=self.bst, evals=evals, evals_result=evals_result, verbose_eval=False) loss = evals_result['train']['logloss'][-1] diff --git a/openfl/federated/data/__init__.py b/openfl/federated/data/__init__.py index e09ff26cd5..91bb604b62 100644 --- a/openfl/federated/data/__init__.py +++ b/openfl/federated/data/__init__.py @@ -27,4 +27,3 @@ if importlib.util.find_spec("xgboost") is not None: from openfl.federated.data.federated_data import FederatedDataSet # NOQA from openfl.federated.data.loader_xgb import XGBoostDataLoader # NOQA - diff --git a/openfl/federated/task/__init__.py b/openfl/federated/task/__init__.py index 33058d8220..8b29264128 100644 --- a/openfl/federated/task/__init__.py +++ b/openfl/federated/task/__init__.py @@ -24,4 +24,4 @@ from openfl.federated.task.runner_pt import PyTorchTaskRunner # NOQA if importlib.util.find_spec("xgboost") is not None: from openfl.federated.task.fl_model import FederatedModel # NOQA - from openfl.federated.task.runner_xgb import XGBoostTaskRunner # NOQA \ No newline at end of file + from openfl.federated.task.runner_xgb import XGBoostTaskRunner # NOQA diff --git a/openfl/federated/task/runner_xgb.py 
b/openfl/federated/task/runner_xgb.py index bcdbe1ec16..517a05c19e 100644 --- a/openfl/federated/task/runner_xgb.py +++ b/openfl/federated/task/runner_xgb.py @@ -71,13 +71,13 @@ def validate_task(self, col_name, round_num, input_tensor_dict, **kwargs): # during agg validation, self.bst will still be None. during local validation, it will have a value - no need to rebuild if self.bst is None: self.rebuild_model(input_tensor_dict) - + # if self.bst is still None after rebuilding, then there was no initial global model, so set metric to 0 if self.bst is None: # for first round agg validation, there is no model so set metric to 0 # TODO: this is not robust, especially if using a loss metric metric = Metric(name="accuracy", value=np.array(0)) - else: + else: metric = self.validate_(loader) origin = col_name @@ -128,7 +128,7 @@ def train_task( } # output model tensors (Doesn't include TensorKey) - output_model_dict = self.get_tensor_dict() + output_model_dict = self.get_tensor_dict() global_model_dict, local_model_dict = split_tensor_dict_for_holdouts( self.logger, output_model_dict, **self.tensor_dict_split_fn_kwargs ) @@ -137,7 +137,7 @@ def train_task( global_tensorkey_model_dict = { TensorKey(tensor_name, origin, round_num, False, tags): nparray for tensor_name, nparray in global_model_dict.items() - } + } # Create tensorkeys that should stay local local_tensorkey_model_dict = { TensorKey(tensor_name, origin, round_num, False, tags): nparray @@ -181,7 +181,7 @@ def train_task( # import pdb; pdb.set_trace() #TODO it is still decodable from here with .tobytes().decode('utf-8') return global_tensor_dict, local_tensor_dict - + def get_tensor_dict(self, with_opt_vars=False): """ Retrieves the tensor dictionary containing the model's tree structure. @@ -222,7 +222,7 @@ def get_tensor_dict(self, with_opt_vars=False): latest_trees_float32_array = np.frombuffer(latest_trees_bytes, dtype=np.uint8).astype(np.float32) return {'local_tree': latest_trees_float32_array} - + def get_required_tensorkeys_for_function(self, func_name, **kwargs): """Get the required tensors for specified function that could be called @@ -341,8 +341,8 @@ def train_(self, train_dataloader) -> Metric: dtrain = train_dataloader['dmatrix'] evals = [(dtrain, 'train')] evals_result = {} - - self.bst = xgb.train(self.params, dtrain, self.num_rounds, xgb_model=self.bst, + + self.bst = xgb.train(self.params, dtrain, self.num_rounds, xgb_model=self.bst, evals=evals, evals_result=evals_result, verbose_eval=False) loss = evals_result['train']['logloss'][-1] diff --git a/openfl/interface/aggregation_functions/fed_bagging.py b/openfl/interface/aggregation_functions/fed_bagging.py index d67c977fbd..179031a0ec 100644 --- a/openfl/interface/aggregation_functions/fed_bagging.py +++ b/openfl/interface/aggregation_functions/fed_bagging.py @@ -25,7 +25,7 @@ def get_global_model(iterator, target_round): return item['nparray'] raise ValueError(f"No item found with tag 'model' and round {target_round}") - + def append_trees(global_model, local_trees): """ Appends local trees to the global model. 
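A quick way to sanity-check an aggregated model, sketched under the
assumption that the merged bytes form schema-consistent booster JSON (the
helper name is illustrative; the load path mirrors set_tensor_dict above):

    import numpy as np
    import xgboost as xgb

    def load_merged(arr: np.ndarray) -> xgb.Booster:
        raw = bytearray(arr.astype(np.uint8).tobytes())
        bst = xgb.Booster()
        bst.load_model(raw)  # raises if the appended trees broke the schema
        return bst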
@@ -92,12 +92,12 @@ def call(self, local_tensors, db_iterator, tensor_name, fl_round, *_): """ global_model = get_global_model(db_iterator, fl_round) - + if (isinstance(global_model, np.ndarray) and global_model.size == 0) or global_model is None: for local_tensor in local_tensors: local_tree_bytearray = bytearray(local_tensor.tensor.astype(np.uint8).tobytes()) local_tree_json = json.loads(local_tree_bytearray) - + if (isinstance(global_model, np.ndarray) and global_model.size == 0) or global_model is None: # the first tree becomes the global model global_model = local_tree_json @@ -117,5 +117,5 @@ def call(self, local_tensors, db_iterator, tensor_name, fl_round, *_): global_model_json = json.dumps(global_model) global_model_bytes = global_model_json.encode('utf-8') - - return np.frombuffer(global_model_bytes, dtype=np.uint8).astype(np.float32) \ No newline at end of file + + return np.frombuffer(global_model_bytes, dtype=np.uint8).astype(np.float32) From 51a0afafe04f5269a542d2de3473f99e51beb0b7 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Fri, 15 Nov 2024 13:25:25 -0800 Subject: [PATCH 13/40] more formatting fixes Signed-off-by: kta-intel --- openfl/federated/data/loader_xgb.py | 14 +- openfl/federated/task/runner_xgb.py | 120 +++++++++++------- .../aggregation_functions/__init__.py | 2 +- .../aggregation_functions/fed_bagging.py | 29 +++-- 4 files changed, 96 insertions(+), 69 deletions(-) diff --git a/openfl/federated/data/loader_xgb.py b/openfl/federated/data/loader_xgb.py index 99a1a33ec5..73f80e49e4 100644 --- a/openfl/federated/data/loader_xgb.py +++ b/openfl/federated/data/loader_xgb.py @@ -1,6 +1,8 @@ +from math import ceil + import numpy as np import xgboost as xgb -from math import ceil + class XGBoostDataLoader: """A class used to represent a Data Loader for XGBoost models. @@ -155,10 +157,7 @@ def get_train_dmatrix(self): Returns: xgb.DMatrix: The DMatrix object for the training data. """ - return { - 'dmatrix': self.get_dmatrix(self.X_train, self.y_train), - 'labels': self.y_train - } + return {"dmatrix": self.get_dmatrix(self.X_train, self.y_train), "labels": self.y_train} def get_valid_dmatrix(self): """Returns the DMatrix for the validation data. @@ -166,7 +165,4 @@ def get_valid_dmatrix(self): Returns: xgb.DMatrix: The DMatrix object for the validation data. 
""" - return { - 'dmatrix': self.get_dmatrix(self.X_valid, self.y_valid), - 'labels': self.y_valid - } + return {"dmatrix": self.get_dmatrix(self.X_valid, self.y_valid), "labels": self.y_valid} diff --git a/openfl/federated/task/runner_xgb.py b/openfl/federated/task/runner_xgb.py index 517a05c19e..cdc2972a87 100644 --- a/openfl/federated/task/runner_xgb.py +++ b/openfl/federated/task/runner_xgb.py @@ -7,17 +7,16 @@ # from copy import deepcopy # from typing import Iterator, Tuple -import numpy as np import json +import numpy as np +import xgboost as xgb +from sklearn.metrics import accuracy_score + from openfl.federated.task.runner import TaskRunner from openfl.utilities import Metric, TensorKey, change_tags from openfl.utilities.split import split_tensor_dict_for_holdouts -import xgboost as xgb -import json -from sklearn.metrics import accuracy_score - class XGBoostTaskRunner(TaskRunner): def __init__(self, **kwargs): @@ -46,8 +45,13 @@ def rebuild_model(self, input_tensor_dict): Returns: None """ - if (isinstance(input_tensor_dict['local_tree'], np.ndarray) and input_tensor_dict['local_tree'].size != 0) \ - or (not isinstance(input_tensor_dict['local_tree'], np.ndarray) and input_tensor_dict['local_tree'] is not None): + if ( + isinstance(input_tensor_dict["local_tree"], np.ndarray) + and input_tensor_dict["local_tree"].size != 0 + ) or ( + not isinstance(input_tensor_dict["local_tree"], np.ndarray) + and input_tensor_dict["local_tree"] is not None + ): self.set_tensor_dict(input_tensor_dict) def validate_task(self, col_name, round_num, input_tensor_dict, **kwargs): @@ -179,50 +183,61 @@ def train_task( # Return global_tensor_dict, local_tensor_dict # import pdb; pdb.set_trace() - #TODO it is still decodable from here with .tobytes().decode('utf-8') + # TODO it is still decodable from here with .tobytes().decode('utf-8') return global_tensor_dict, local_tensor_dict def get_tensor_dict(self, with_opt_vars=False): - """ - Retrieves the tensor dictionary containing the model's tree structure. - - This method returns a dictionary with the key 'local_tree', which contains the model's tree structure as a numpy array. - If the model has not been initialized (`self.bst` is None), it returns an empty numpy array. - If the global model is not set or is empty, it returns the entire model as a numpy array. - Otherwise, it returns only the trees added in the latest training session. - - Parameters: - with_opt_vars (bool): N/A for XGBoost (Default=False). - - Returns: - dict: A dictionary with the key 'local_tree' containing the model's tree structure as a numpy array. - """ - - if self.bst is None: - # For initializing tensor dict - return {'local_tree': np.array([], dtype=np.float32)} + """ + Retrieves the tensor dictionary containing the model's tree structure. - booster_array = self.bst.save_raw('json') - booster_dict = json.loads(booster_array) + This method returns a dictionary with the key 'local_tree', which contains the model's tree structure as a numpy array. + If the model has not been initialized (`self.bst` is None), it returns an empty numpy array. + If the global model is not set or is empty, it returns the entire model as a numpy array. + Otherwise, it returns only the trees added in the latest training session. 
- if (isinstance(self.global_model, np.ndarray) and self.global_model.size == 0) or self.global_model is None: - booster_float32_array = np.frombuffer(booster_array, dtype=np.uint8).astype(np.float32) - return {'local_tree': booster_float32_array} + Parameters: + with_opt_vars (bool): N/A for XGBoost (Default=False). - global_model_booster_dict = json.loads(self.global_model) - num_global_trees = int(global_model_booster_dict["learner"]["gradient_booster"]["model"]["gbtree_model_param"]["num_trees"]) - num_total_trees = int(booster_dict["learner"]["gradient_booster"]["model"]["gbtree_model_param"]["num_trees"]) + Returns: + dict: A dictionary with the key 'local_tree' containing the model's tree structure as a numpy array. + """ - # Calculate the number of trees added in the latest training - num_latest_trees = num_total_trees - num_global_trees - latest_trees = booster_dict['learner']['gradient_booster']['model']['trees'][-num_latest_trees:] + if self.bst is None: + # For initializing tensor dict + return {"local_tree": np.array([], dtype=np.float32)} + + booster_array = self.bst.save_raw("json") + booster_dict = json.loads(booster_array) + + if ( + isinstance(self.global_model, np.ndarray) and self.global_model.size == 0 + ) or self.global_model is None: + booster_float32_array = np.frombuffer(booster_array, dtype=np.uint8).astype(np.float32) + return {"local_tree": booster_float32_array} + + global_model_booster_dict = json.loads(self.global_model) + num_global_trees = int( + global_model_booster_dict["learner"]["gradient_booster"]["model"]["gbtree_model_param"][ + "num_trees" + ] + ) + num_total_trees = int( + booster_dict["learner"]["gradient_booster"]["model"]["gbtree_model_param"]["num_trees"] + ) - latest_trees_json = json.dumps(latest_trees) - latest_trees_bytes = latest_trees_json.encode('utf-8') - latest_trees_float32_array = np.frombuffer(latest_trees_bytes, dtype=np.uint8).astype(np.float32) + # Calculate the number of trees added in the latest training + num_latest_trees = num_total_trees - num_global_trees + latest_trees = booster_dict["learner"]["gradient_booster"]["model"]["trees"][ + -num_latest_trees: + ] - return {'local_tree': latest_trees_float32_array} + latest_trees_json = json.dumps(latest_trees) + latest_trees_bytes = latest_trees_json.encode("utf-8") + latest_trees_float32_array = np.frombuffer(latest_trees_bytes, dtype=np.uint8).astype( + np.float32 + ) + return {"local_tree": latest_trees_float32_array} def get_required_tensorkeys_for_function(self, func_name, **kwargs): """Get the required tensors for specified function that could be called @@ -316,7 +331,7 @@ def set_tensor_dict(self, tensor_dict, with_opt_vars=False): with_opt_vars (bool): N/A for XGBoost (Default=False). 
""" # The with_opt_vars argument is not used in this method - self.global_model = bytearray(tensor_dict['local_tree'].astype(np.uint8).tobytes()) + self.global_model = bytearray(tensor_dict["local_tree"].astype(np.uint8).tobytes()) self.bst = xgb.Booster() self.bst.load_model(self.global_model) @@ -338,21 +353,28 @@ def save_native( def train_(self, train_dataloader) -> Metric: """Train model.""" - dtrain = train_dataloader['dmatrix'] - evals = [(dtrain, 'train')] + dtrain = train_dataloader["dmatrix"] + evals = [(dtrain, "train")] evals_result = {} - self.bst = xgb.train(self.params, dtrain, self.num_rounds, xgb_model=self.bst, - evals=evals, evals_result=evals_result, verbose_eval=False) + self.bst = xgb.train( + self.params, + dtrain, + self.num_rounds, + xgb_model=self.bst, + evals=evals, + evals_result=evals_result, + verbose_eval=False, + ) - loss = evals_result['train']['logloss'][-1] + loss = evals_result["train"]["logloss"][-1] return Metric(name=self.loss_fn.__name__, value=np.array(loss)) def validate_(self, validation_dataloader) -> Metric: """Validate model.""" - dtest = validation_dataloader['dmatrix'] - y_test = validation_dataloader['labels'] + dtest = validation_dataloader["dmatrix"] + y_test = validation_dataloader["labels"] preds = self.bst.predict(dtest) y_pred_binary = np.where(preds > 0.5, 1, 0) acc = accuracy_score(y_test, y_pred_binary) diff --git a/openfl/interface/aggregation_functions/__init__.py b/openfl/interface/aggregation_functions/__init__.py index 1ddb7d0f25..0ee32655c6 100644 --- a/openfl/interface/aggregation_functions/__init__.py +++ b/openfl/interface/aggregation_functions/__init__.py @@ -7,9 +7,9 @@ ) from openfl.interface.aggregation_functions.adam_adaptive_aggregation import AdamAdaptiveAggregation from openfl.interface.aggregation_functions.core import AggregationFunction +from openfl.interface.aggregation_functions.fed_bagging import FedBaggingXGBoost from openfl.interface.aggregation_functions.fedcurv_weighted_average import FedCurvWeightedAverage from openfl.interface.aggregation_functions.geometric_median import GeometricMedian from openfl.interface.aggregation_functions.median import Median from openfl.interface.aggregation_functions.weighted_average import WeightedAverage from openfl.interface.aggregation_functions.yogi_adaptive_aggregation import YogiAdaptiveAggregation -from openfl.interface.aggregation_functions.fed_bagging import FedBaggingXGBoost diff --git a/openfl/interface/aggregation_functions/fed_bagging.py b/openfl/interface/aggregation_functions/fed_bagging.py index 179031a0ec..aaec86fb24 100644 --- a/openfl/interface/aggregation_functions/fed_bagging.py +++ b/openfl/interface/aggregation_functions/fed_bagging.py @@ -5,9 +5,12 @@ """Federated Boostrap Aggregation for XGBoost module.""" import json + import numpy as np + from openfl.interface.aggregation_functions.core import AggregationFunction + def get_global_model(iterator, target_round): """ Retrieves the global model for the specific round from an iterator. 
@@ -21,8 +24,8 @@ def get_global_model(iterator, target_round): """ for item in iterator: # Items tagged with ('model',) are the global model of that round - if 'tags' in item and item['tags'] == ('model',) and item['round'] == target_round: - return item['nparray'] + if "tags" in item and item["tags"] == ("model",) and item["round"] == target_round: + return item["nparray"] raise ValueError(f"No item found with tag 'model' and round {target_round}") @@ -37,7 +40,9 @@ def append_trees(global_model, local_trees): Returns: dict: The updated global model with the local trees appended. """ - num_global_trees = int(global_model["learner"]["gradient_booster"]["model"]["gbtree_model_param"]["num_trees"]) + num_global_trees = int( + global_model["learner"]["gradient_booster"]["model"]["gbtree_model_param"]["num_trees"] + ) num_local_trees = len(local_trees) global_model["learner"]["gradient_booster"]["model"]["gbtree_model_param"]["num_trees"] = str( @@ -47,9 +52,9 @@ def append_trees(global_model, local_trees): num_global_trees + num_local_trees ) for new_tree in range(num_local_trees): - local_trees[new_tree]["id"] = num_global_trees + new_tree - global_model["learner"]["gradient_booster"]["model"]["trees"].append(local_trees[new_tree]) - global_model["learner"]["gradient_booster"]["model"]["tree_info"].append(0) + local_trees[new_tree]["id"] = num_global_trees + new_tree + global_model["learner"]["gradient_booster"]["model"]["trees"].append(local_trees[new_tree]) + global_model["learner"]["gradient_booster"]["model"]["tree_info"].append(0) return global_model @@ -93,18 +98,22 @@ def call(self, local_tensors, db_iterator, tensor_name, fl_round, *_): global_model = get_global_model(db_iterator, fl_round) - if (isinstance(global_model, np.ndarray) and global_model.size == 0) or global_model is None: + if ( + isinstance(global_model, np.ndarray) and global_model.size == 0 + ) or global_model is None: for local_tensor in local_tensors: local_tree_bytearray = bytearray(local_tensor.tensor.astype(np.uint8).tobytes()) local_tree_json = json.loads(local_tree_bytearray) - if (isinstance(global_model, np.ndarray) and global_model.size == 0) or global_model is None: + if ( + isinstance(global_model, np.ndarray) and global_model.size == 0 + ) or global_model is None: # the first tree becomes the global model global_model = local_tree_json else: # append subsequent trees to global model local_model = local_tree_json - local_trees = local_model['learner']['gradient_booster']['model']['trees'] + local_trees = local_model["learner"]["gradient_booster"]["model"]["trees"] global_model = append_trees(global_model, local_trees) else: global_model_bytearray = bytearray(global_model.astype(np.uint8).tobytes()) @@ -116,6 +125,6 @@ def call(self, local_tensors, db_iterator, tensor_name, fl_round, *_): global_model = append_trees(global_model, local_trees) global_model_json = json.dumps(global_model) - global_model_bytes = global_model_json.encode('utf-8') + global_model_bytes = global_model_json.encode("utf-8") return np.frombuffer(global_model_bytes, dtype=np.uint8).astype(np.float32) From d3937efc54c3740ec9d7bed4b0319971bed9f742 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Fri, 15 Nov 2024 13:41:36 -0800 Subject: [PATCH 14/40] revert space removal Signed-off-by: kta-intel --- openfl/interface/plan.py | 1 + 1 file changed, 1 insertion(+) diff --git a/openfl/interface/plan.py b/openfl/interface/plan.py index d235ad3070..a5e2fd9b2f 100644 --- a/openfl/interface/plan.py +++ b/openfl/interface/plan.py @@ -172,6 +172,7 
@@ def initialize( ) data_loader = get_dataloader(plan, prefer_minimal=True, input_shape=input_shape) + task_runner = plan.get_task_runner(data_loader) tensor_pipe = plan.get_tensor_pipe() From dd2027c12b924851bb52968a4bc5309d80eddf5a Mon Sep 17 00:00:00 2001 From: kta-intel Date: Fri, 15 Nov 2024 13:46:50 -0800 Subject: [PATCH 15/40] Revert "revert space removal" This reverts commit d3937efc54c3740ec9d7bed4b0319971bed9f742. Signed-off-by: kta-intel --- openfl/interface/plan.py | 1 - 1 file changed, 1 deletion(-) diff --git a/openfl/interface/plan.py b/openfl/interface/plan.py index a5e2fd9b2f..d235ad3070 100644 --- a/openfl/interface/plan.py +++ b/openfl/interface/plan.py @@ -172,7 +172,6 @@ def initialize( ) data_loader = get_dataloader(plan, prefer_minimal=True, input_shape=input_shape) - task_runner = plan.get_task_runner(data_loader) tensor_pipe = plan.get_tensor_pipe() From e008e4ad9671dc641206a1dc797df497f8295f2b Mon Sep 17 00:00:00 2001 From: kta-intel Date: Fri, 15 Nov 2024 13:53:55 -0800 Subject: [PATCH 16/40] revert changes on interface.plan Signed-off-by: kta-intel --- openfl/interface/plan.py | 1 + 1 file changed, 1 insertion(+) diff --git a/openfl/interface/plan.py b/openfl/interface/plan.py index d235ad3070..a5e2fd9b2f 100644 --- a/openfl/interface/plan.py +++ b/openfl/interface/plan.py @@ -172,6 +172,7 @@ def initialize( ) data_loader = get_dataloader(plan, prefer_minimal=True, input_shape=input_shape) + task_runner = plan.get_task_runner(data_loader) tensor_pipe = plan.get_tensor_pipe() From 3cbd5e5625b6f54e1ac6663c53da6f3f88d3141c Mon Sep 17 00:00:00 2001 From: kta-intel Date: Fri, 15 Nov 2024 13:56:40 -0800 Subject: [PATCH 17/40] remove from history. unchanged Signed-off-by: kta-intel --- openfl/interface/plan.py | 1 - 1 file changed, 1 deletion(-) diff --git a/openfl/interface/plan.py b/openfl/interface/plan.py index a5e2fd9b2f..d235ad3070 100644 --- a/openfl/interface/plan.py +++ b/openfl/interface/plan.py @@ -172,7 +172,6 @@ def initialize( ) data_loader = get_dataloader(plan, prefer_minimal=True, input_shape=input_shape) - task_runner = plan.get_task_runner(data_loader) tensor_pipe = plan.get_tensor_pipe() From 051d8fcda5b56d5e82202b33ff1c757c86db343a Mon Sep 17 00:00:00 2001 From: kta-intel Date: Fri, 15 Nov 2024 13:57:58 -0800 Subject: [PATCH 18/40] reverting back to fresh state for interface.plan Signed-off-by: kta-intel --- openfl/interface/plan.py | 1 + 1 file changed, 1 insertion(+) diff --git a/openfl/interface/plan.py b/openfl/interface/plan.py index d235ad3070..f4c91faed0 100644 --- a/openfl/interface/plan.py +++ b/openfl/interface/plan.py @@ -172,6 +172,7 @@ def initialize( ) data_loader = get_dataloader(plan, prefer_minimal=True, input_shape=input_shape) + task_runner = plan.get_task_runner(data_loader) tensor_pipe = plan.get_tensor_pipe() From a8d9b5997f9cb654c0787a4234f3ac339345d322 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Fri, 15 Nov 2024 14:36:54 -0800 Subject: [PATCH 19/40] move delta_updates below assigner in args Signed-off-by: kta-intel --- openfl/component/aggregator/aggregator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openfl/component/aggregator/aggregator.py b/openfl/component/aggregator/aggregator.py index 5e0d6a3dc0..780fed6a81 100644 --- a/openfl/component/aggregator/aggregator.py +++ b/openfl/component/aggregator/aggregator.py @@ -68,8 +68,8 @@ def __init__( init_state_path, best_state_path, last_state_path, - delta_updates, assigner, + delta_updates, straggler_handling_policy=None, rounds_to_train=256, 
single_col_cert_common_name=None, From 5f1d909ad1da6340e49f75568aa5ff0b0a46cb4a Mon Sep 17 00:00:00 2001 From: kta-intel Date: Fri, 15 Nov 2024 14:44:18 -0800 Subject: [PATCH 20/40] add delta_update default to True, remove from yaml Signed-off-by: kta-intel --- openfl-workspace/workspace/plan/defaults/aggregator.yaml | 1 - openfl/component/aggregator/aggregator.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/openfl-workspace/workspace/plan/defaults/aggregator.yaml b/openfl-workspace/workspace/plan/defaults/aggregator.yaml index aac5a27f39..0bb76e099d 100644 --- a/openfl-workspace/workspace/plan/defaults/aggregator.yaml +++ b/openfl-workspace/workspace/plan/defaults/aggregator.yaml @@ -2,4 +2,3 @@ template : openfl.component.Aggregator settings : db_store_rounds : 2 write_logs : true - delta_updates : true diff --git a/openfl/component/aggregator/aggregator.py b/openfl/component/aggregator/aggregator.py index 780fed6a81..ea04b5fe88 100644 --- a/openfl/component/aggregator/aggregator.py +++ b/openfl/component/aggregator/aggregator.py @@ -69,7 +69,7 @@ def __init__( best_state_path, last_state_path, assigner, - delta_updates, + delta_updates=True, straggler_handling_policy=None, rounds_to_train=256, single_col_cert_common_name=None, From 3670bd04c263a68d87e0bb6e4b65984313695e39 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Sat, 16 Nov 2024 06:03:28 -0800 Subject: [PATCH 21/40] enable modin pandas Signed-off-by: kta-intel --- openfl-workspace/xgb_higgs/requirements.txt | 1 + openfl-workspace/xgb_higgs/src/dataloader.py | 2 +- openfl-workspace/xgb_higgs/src/setup_data.py | 4 ++-- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/openfl-workspace/xgb_higgs/requirements.txt b/openfl-workspace/xgb_higgs/requirements.txt index d65559c90c..797917eff2 100644 --- a/openfl-workspace/xgb_higgs/requirements.txt +++ b/openfl-workspace/xgb_higgs/requirements.txt @@ -1,2 +1,3 @@ scikit-learn xgboost +modin[all] diff --git a/openfl-workspace/xgb_higgs/src/dataloader.py b/openfl-workspace/xgb_higgs/src/dataloader.py index f03e2cb94f..2d472ac265 100644 --- a/openfl-workspace/xgb_higgs/src/dataloader.py +++ b/openfl-workspace/xgb_higgs/src/dataloader.py @@ -4,7 +4,7 @@ from openfl.federated import XGBoostDataLoader import os -import pandas as pd +import modin.pandas as pd class HiggsDataLoader(XGBoostDataLoader): def __init__(self, data_path, **kwargs): diff --git a/openfl-workspace/xgb_higgs/src/setup_data.py b/openfl-workspace/xgb_higgs/src/setup_data.py index d9c540a559..96c8bbdc0a 100644 --- a/openfl-workspace/xgb_higgs/src/setup_data.py +++ b/openfl-workspace/xgb_higgs/src/setup_data.py @@ -6,7 +6,7 @@ from hashlib import sha384 from os import path, makedirs from tqdm import tqdm -import pandas as pd +import modin.pandas as pd import gzip from sklearn.model_selection import train_test_split import numpy as np @@ -65,7 +65,7 @@ def main(): print("Creating splits for {} collaborators".format(collaborators)) # Load the dataset - higgs_data = pd.read_csv(path.join(src, CSV_FILENAME), header=None, nrows=1000000) + higgs_data = pd.read_csv(path.join(src, CSV_FILENAME), header=None) # Split the dataset into features and labels X = higgs_data.iloc[:, 1:].values From dcfdd70511f9972e076f792431641f68ec5929f6 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Mon, 18 Nov 2024 08:04:20 -0800 Subject: [PATCH 22/40] add DO NOT EDIT notice Signed-off-by: kta-intel --- openfl-workspace/xgb_higgs/plan/cols.yaml | 2 +- openfl-workspace/xgb_higgs/plan/data.yaml | 2 +- 2 files changed, 2 
insertions(+), 2 deletions(-) diff --git a/openfl-workspace/xgb_higgs/plan/cols.yaml b/openfl-workspace/xgb_higgs/plan/cols.yaml index 5b0f52178d..b085067f50 100644 --- a/openfl-workspace/xgb_higgs/plan/cols.yaml +++ b/openfl-workspace/xgb_higgs/plan/cols.yaml @@ -1,5 +1,5 @@ # Copyright (C) 2024 Intel Corporation # Licensed subject to the terms of the separately executed evaluation license agreement between Intel Corporation and you. -# This file lists the collaborators associated with the federation. The list will be auto-populated during collaborator creation. +# DO NOT EDIT: This file lists the collaborators associated with the federation. The list will be auto-populated during collaborator creation. collaborators: diff --git a/openfl-workspace/xgb_higgs/plan/data.yaml b/openfl-workspace/xgb_higgs/plan/data.yaml index a6825c5ab1..4b9d070127 100644 --- a/openfl-workspace/xgb_higgs/plan/data.yaml +++ b/openfl-workspace/xgb_higgs/plan/data.yaml @@ -1,5 +1,5 @@ # Copyright (C) 2024 Intel Corporation # Licensed subject to the terms of the separately executed evaluation license agreement between Intel Corporation and you. -# This file specifies the local data directory associated with the respective collaborator. This will be auto-populated during collaborator creation +# DO NOT EDIT: This file specifies the local data directory associated with the respective collaborator. This will be auto-populated during collaborator creation # collaborator_name,data_directory_path \ No newline at end of file From bd03eac2cc9e54f8182af3f9d2d8437dd133195f Mon Sep 17 00:00:00 2001 From: kta-intel Date: Mon, 18 Nov 2024 08:04:50 -0800 Subject: [PATCH 23/40] added docstrings Signed-off-by: kta-intel --- openfl-workspace/xgb_higgs/src/dataloader.py | 32 +++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/openfl-workspace/xgb_higgs/src/dataloader.py b/openfl-workspace/xgb_higgs/src/dataloader.py index 2d472ac265..47416cbefa 100644 --- a/openfl-workspace/xgb_higgs/src/dataloader.py +++ b/openfl-workspace/xgb_higgs/src/dataloader.py @@ -7,6 +7,18 @@ import modin.pandas as pd class HiggsDataLoader(XGBoostDataLoader): + """ + DataLoader for the Higgs dataset. + + This class inherits from XGBoostDataLoader and is responsible for loading + the Higgs dataset for training and validation. + + Attributes: + X_train (numpy.ndarray): Training features. + y_train (numpy.ndarray): Training labels. + X_valid (numpy.ndarray): Validation features. + y_valid (numpy.ndarray): Validation labels. + """ def __init__(self, data_path, **kwargs): super().__init__(**kwargs) X_train, y_train, X_valid, y_valid = load_Higgs( @@ -19,6 +31,24 @@ def __init__(self, data_path, **kwargs): def load_Higgs(data_path, **kwargs): + """ + Load the Higgs dataset from CSV files. + + The dataset is expected to be in two CSV files: 'train.csv' and 'valid.csv'. + The first column in each file represents the labels, and the remaining + columns represent the features. + + Args: + data_path (str): The directory path where the CSV files are located. + **kwargs: Additional keyword arguments. + + Returns: + tuple: A tuple containing four elements: + - X_train (numpy.ndarray): Training features. + - y_train (numpy.ndarray): Training labels. + - X_valid (numpy.ndarray): Validation features. + - y_valid (numpy.ndarray): Validation labels. 
+ """ train_data = pd.read_csv(os.path.join(data_path, 'train.csv'), header=None) X_train = train_data.iloc[:, 1:].values y_train = train_data.iloc[:, 0].values @@ -27,4 +57,4 @@ def load_Higgs(data_path, **kwargs): X_valid = valid_data.iloc[:, 1:].values y_valid = valid_data.iloc[:, 0].values - return X_train, y_train, X_valid, y_valid + return X_train, y_train, X_valid, y_valid \ No newline at end of file From 326069dc3a614bdad82f194485fd3e0d9e4e294c Mon Sep 17 00:00:00 2001 From: kta-intel Date: Mon, 18 Nov 2024 08:06:11 -0800 Subject: [PATCH 24/40] set DEFAULT_PATH to cwd Signed-off-by: kta-intel --- openfl-workspace/xgb_higgs/src/setup_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openfl-workspace/xgb_higgs/src/setup_data.py b/openfl-workspace/xgb_higgs/src/setup_data.py index 96c8bbdc0a..b160e96c36 100644 --- a/openfl-workspace/xgb_higgs/src/setup_data.py +++ b/openfl-workspace/xgb_higgs/src/setup_data.py @@ -19,7 +19,7 @@ FILENAME = "HIGGS.csv.gz" CSV_FILENAME = "HIGGS.csv" CSV_SHA384 = 'b8b82e11a78b81601381420878ad42ba557291f394a88dc5293e4077c8363c87429639b120e299a2a9939c1f943b6a63' -DEFAULT_PATH = path.join(path.expanduser('~'), '.openfl', 'data') +DEFAULT_PATH = path.join(os.getcwd(), 'data') pbar = tqdm(total=None) From 8a75cc57b176604e874af5b8fa288b86e4780881 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Mon, 18 Nov 2024 08:06:43 -0800 Subject: [PATCH 25/40] fix docstrings and remove commented out lines Signed-off-by: kta-intel --- openfl-workspace/xgb_higgs/src/taskrunner.py | 39 +++++++++++++++----- openfl/federated/task/runner_xgb.py | 34 +++++++++++++---- 2 files changed, 56 insertions(+), 17 deletions(-) diff --git a/openfl-workspace/xgb_higgs/src/taskrunner.py b/openfl-workspace/xgb_higgs/src/taskrunner.py index 410c4f49c9..520e303be2 100644 --- a/openfl-workspace/xgb_higgs/src/taskrunner.py +++ b/openfl-workspace/xgb_higgs/src/taskrunner.py @@ -13,18 +13,24 @@ class XGBoostRunner(XGBoostTaskRunner): """ - Simple CNN for classification. + A class to run XGBoost training and validation tasks. - PyTorchTaskRunner inherits from nn.module, so you can define your model - in the same way that you would for PyTorch - """ + This class inherits from XGBoostTaskRunner and provides methods to train and validate + an XGBoost model using federated learning. + Attributes: + bst (xgb.Booster): The XGBoost model. + params (dict): Parameters for the XGBoost model. + num_rounds (int): Number of boosting rounds. + """ def __init__(self, params=None, num_rounds=1, **kwargs): - """Initialize. + """ + Initialize the XGBoostRunner. Args: - **kwargs: Additional arguments to pass to the function - + params (dict, optional): Parameters for the XGBoost model. Defaults to None. + num_rounds (int, optional): Number of boosting rounds. Defaults to 1. + **kwargs: Additional arguments to pass to the function. """ super().__init__(**kwargs) @@ -33,7 +39,15 @@ def __init__(self, params=None, num_rounds=1, **kwargs): self.num_rounds = num_rounds def train_(self, train_dataloader) -> Metric: - """Train model.""" + """ + Train the XGBoost model. + + Args: + train_dataloader (dict): A dictionary containing the training data with keys 'dmatrix'. + + Returns: + Metric: A Metric object containing the training loss. 
+ """ dtrain = train_dataloader['dmatrix'] evals = [(dtrain, 'train')] evals_result = {} @@ -45,8 +59,15 @@ def train_(self, train_dataloader) -> Metric: return Metric(name=self.params['eval_metric'], value=np.array(loss)) def validate_(self, validation_dataloader) -> Metric: - """Validate model.""" + """ + Validate the XGBoost model. + Args: + validation_dataloader (dict): A dictionary containing the validation data with keys 'dmatrix' and 'labels'. + + Returns: + Metric: A Metric object containing the validation accuracy. + """ dtest = validation_dataloader['dmatrix'] y_test = validation_dataloader['labels'] preds = self.bst.predict(dtest) diff --git a/openfl/federated/task/runner_xgb.py b/openfl/federated/task/runner_xgb.py index cdc2972a87..782a5b8213 100644 --- a/openfl/federated/task/runner_xgb.py +++ b/openfl/federated/task/runner_xgb.py @@ -4,9 +4,6 @@ """XGBoostTaskRunner module.""" -# from copy import deepcopy -# from typing import Iterator, Tuple - import json import numpy as np @@ -20,10 +17,16 @@ class XGBoostTaskRunner(TaskRunner): def __init__(self, **kwargs): - """Initializes the XGBoostTaskRunner object. + """ + A class to manage XGBoost tasks in a federated learning environment. - Args: - **kwargs: Additional parameters to pass to the functions. + This class inherits from TaskRunner and provides methods to initialize and manage + the global model and required tensor keys for XGBoost tasks. + + Attributes: + global_model (xgb.Booster): The global XGBoost model. + required_tensorkeys_for_function (dict): A dictionary to store required tensor keys for each function. + training_round_completed (bool): A flag to indicate if the training round is completed. """ super().__init__(**kwargs) self.global_model = None @@ -352,7 +355,15 @@ def save_native( self.bst.save_model(filepath) def train_(self, train_dataloader) -> Metric: - """Train model.""" + """ + Train the XGBoost model. + + Args: + train_dataloader (dict): A dictionary containing the training data with keys 'dmatrix'. + + Returns: + Metric: A Metric object containing the training loss. + """ dtrain = train_dataloader["dmatrix"] evals = [(dtrain, "train")] evals_result = {} @@ -371,8 +382,15 @@ def train_(self, train_dataloader) -> Metric: return Metric(name=self.loss_fn.__name__, value=np.array(loss)) def validate_(self, validation_dataloader) -> Metric: - """Validate model.""" + """ + Validate the XGBoost model. + Args: + validation_dataloader (dict): A dictionary containing the validation data with keys 'dmatrix' and 'labels'. + + Returns: + Metric: A Metric object containing the validation accuracy. 
+ """ dtest = validation_dataloader["dmatrix"] y_test = validation_dataloader["labels"] preds = self.bst.predict(dtest) From 450d8c39fe6b092e5cf80ccd0f29e31387a12185 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Mon, 18 Nov 2024 08:07:15 -0800 Subject: [PATCH 26/40] change to use_delta_updates for readibility Signed-off-by: kta-intel --- openfl-workspace/xgb_higgs/plan/plan.yaml | 2 +- openfl/component/aggregator/aggregator.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/openfl-workspace/xgb_higgs/plan/plan.yaml b/openfl-workspace/xgb_higgs/plan/plan.yaml index 21bb44eb4c..cab8710cb4 100644 --- a/openfl-workspace/xgb_higgs/plan/plan.yaml +++ b/openfl-workspace/xgb_higgs/plan/plan.yaml @@ -10,7 +10,7 @@ aggregator : last_state_path : save/last.pbuf rounds_to_train : 10 write_logs : false - delta_updates : false + use_delta_updates : false collaborator : defaults : plan/defaults/collaborator.yaml diff --git a/openfl/component/aggregator/aggregator.py b/openfl/component/aggregator/aggregator.py index ea04b5fe88..e1c61f8ce3 100644 --- a/openfl/component/aggregator/aggregator.py +++ b/openfl/component/aggregator/aggregator.py @@ -69,7 +69,7 @@ def __init__( best_state_path, last_state_path, assigner, - delta_updates=True, + use_delta_updates=True, straggler_handling_policy=None, rounds_to_train=256, single_col_cert_common_name=None, @@ -187,7 +187,7 @@ def __init__( # Initialize a lock for thread safety self.lock = Lock() - self.delta_updates = delta_updates + self.use_delta_updates = use_delta_updates def _load_initial_tensors(self): """Load all of the tensors required to begin federated learning. @@ -804,7 +804,7 @@ def _prepare_trained(self, tensor_name, origin, round_number, report, agg_result # Create delta and save it in TensorDB base_model_tk = TensorKey(tensor_name, origin, round_number, report, ("model",)) base_model_nparray = self.tensor_db.get_tensor_from_cache(base_model_tk) - if base_model_nparray is not None and self.delta_updates: + if base_model_nparray is not None and self.use_delta_updates: delta_tk, delta_nparray = self.tensor_codec.generate_delta( agg_tag_tk, agg_results, base_model_nparray ) @@ -833,7 +833,7 @@ def _prepare_trained(self, tensor_name, origin, round_number, report, agg_result self.tensor_db.cache_tensor({decompressed_delta_tk: decompressed_delta_nparray}) # Apply delta (unless delta couldn't be created) - if base_model_nparray is not None and self.delta_updates: + if base_model_nparray is not None and self.use_delta_updates: self.logger.debug("Applying delta for layer %s", decompressed_delta_tk[0]) new_model_tk, new_model_nparray = self.tensor_codec.apply_delta( decompressed_delta_tk, From eecffe01fccd2fbb192a6d0bd1eedda7512e9280 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Mon, 18 Nov 2024 09:03:48 -0800 Subject: [PATCH 27/40] split test data for collaborators Signed-off-by: kta-intel --- openfl-workspace/xgb_higgs/src/setup_data.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/openfl-workspace/xgb_higgs/src/setup_data.py b/openfl-workspace/xgb_higgs/src/setup_data.py index b160e96c36..116d00cf66 100644 --- a/openfl-workspace/xgb_higgs/src/setup_data.py +++ b/openfl-workspace/xgb_higgs/src/setup_data.py @@ -87,8 +87,9 @@ def main(): split_train_data = train_data.iloc[i::collaborators] split_train_data.to_csv(path.join(dst, 'train.csv'), index=False, header=False) - # Copy the test data for the current collaborator - test_data.to_csv(path.join(dst, 'valid.csv'), index=False, header=False) + # Split the 
test data for the current collaborator + split_test_data = test_data.iloc[i::collaborators] + split_test_data.to_csv(path.join(dst, 'valid.csv'), index=False, header=False) if __name__ == '__main__': main() From 238448fc0f73d6ec2d60e0efcffc1538a88e6206 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Mon, 18 Nov 2024 09:04:40 -0800 Subject: [PATCH 28/40] clean up methods Signed-off-by: kta-intel --- openfl/federated/data/loader_xgb.py | 84 ++--------------------------- 1 file changed, 3 insertions(+), 81 deletions(-) diff --git a/openfl/federated/data/loader_xgb.py b/openfl/federated/data/loader_xgb.py index 73f80e49e4..cb8272af98 100644 --- a/openfl/federated/data/loader_xgb.py +++ b/openfl/federated/data/loader_xgb.py @@ -2,9 +2,10 @@ import numpy as np import xgboost as xgb +from openfl.federated.data.loader import DataLoader -class XGBoostDataLoader: +class XGBoostDataLoader(DataLoader): """A class used to represent a Data Loader for XGBoost models. Attributes: @@ -44,37 +45,6 @@ def get_feature_shape(self): """ return self.X_train[0].shape - def get_train_loader(self, batch_size=None, num_batches=None): - """Returns the data loader for the training data. - - Args: - batch_size (int, optional): The batch size for the data loader - (default is None). - num_batches (int, optional): The number of batches for the data - loader (default is None). - - Returns: - generator: The generator object for the training data. - """ - return self._get_batch_generator( - X=self.X_train, - y=self.y_train, - batch_size=batch_size, - num_batches=num_batches, - ) - - def get_valid_loader(self, batch_size=None): - """Returns the data loader for the validation data. - - Args: - batch_size (int, optional): The batch size for the data loader - (default is None). - - Returns: - generator: The generator object for the validation data. - """ - return self._get_batch_generator(X=self.X_valid, y=self.y_valid, batch_size=batch_size) - def get_train_data_size(self): """Returns the total number of training samples. @@ -90,55 +60,7 @@ def get_valid_data_size(self): int: The total number of validation samples. """ return self.X_valid.shape[0] - - @staticmethod - def _batch_generator(X, y, idxs, batch_size, num_batches): - """Generates batches of data. - - Args: - X (np.array): The input data. - y (np.array): The label data. - idxs (np.array): The index of the dataset. - batch_size (int): The batch size for the data loader. - num_batches (int): The number of batches. - - Yields: - tuple: The input data and label data for each batch. - """ - for i in range(num_batches): - a = i * batch_size - b = a + batch_size - yield X[idxs[a:b]], y[idxs[a:b]] - - def _get_batch_generator(self, X, y, batch_size, num_batches=None): - """Returns the dataset generator. - - Args: - X (np.array): The input data. - y (np.array): The label data. - batch_size (int): The batch size for the data loader. - num_batches (int, optional): The number of batches (default is - None). - - Returns: - generator: The dataset generator. - """ - if batch_size is None: - batch_size = self.batch_size - - # shuffle data indices - if self.random_seed is not None: - np.random.seed(self.random_seed) - - idxs = np.random.permutation(np.arange(X.shape[0])) - - # compute the number of batches - if num_batches is None: - num_batches = ceil(X.shape[0] / batch_size) - - # build the generator and return it - return self._batch_generator(X, y, idxs, batch_size, num_batches) - + def get_dmatrix(self, X, y): """Returns the DMatrix for the given data. 
From 16cd7e174d3feaa3af223712e29d160f1a97bba4 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Mon, 18 Nov 2024 09:05:02 -0800 Subject: [PATCH 29/40] clean up taskrunner Signed-off-by: kta-intel --- openfl-workspace/xgb_higgs/src/taskrunner.py | 10 ++-- openfl/federated/task/runner_xgb.py | 63 +++++--------------- 2 files changed, 19 insertions(+), 54 deletions(-) diff --git a/openfl-workspace/xgb_higgs/src/taskrunner.py b/openfl-workspace/xgb_higgs/src/taskrunner.py index 520e303be2..3e030fb7ac 100644 --- a/openfl-workspace/xgb_higgs/src/taskrunner.py +++ b/openfl-workspace/xgb_higgs/src/taskrunner.py @@ -38,7 +38,7 @@ def __init__(self, params=None, num_rounds=1, **kwargs): self.params = params self.num_rounds = num_rounds - def train_(self, train_dataloader) -> Metric: + def train_(self, data) -> Metric: """ Train the XGBoost model. @@ -48,7 +48,7 @@ def train_(self, train_dataloader) -> Metric: Returns: Metric: A Metric object containing the training loss. """ - dtrain = train_dataloader['dmatrix'] + dtrain = data['dmatrix'] evals = [(dtrain, 'train')] evals_result = {} @@ -58,7 +58,7 @@ def train_(self, train_dataloader) -> Metric: loss = evals_result['train']['logloss'][-1] return Metric(name=self.params['eval_metric'], value=np.array(loss)) - def validate_(self, validation_dataloader) -> Metric: + def validate_(self, data) -> Metric: """ Validate the XGBoost model. @@ -68,8 +68,8 @@ def validate_(self, validation_dataloader) -> Metric: Returns: Metric: A Metric object containing the validation accuracy. """ - dtest = validation_dataloader['dmatrix'] - y_test = validation_dataloader['labels'] + dtest = data['dmatrix'] + y_test = data['labels'] preds = self.bst.predict(dtest) y_pred_binary = np.where(preds > 0.5, 1, 0) acc = accuracy_score(y_test, y_pred_binary) diff --git a/openfl/federated/task/runner_xgb.py b/openfl/federated/task/runner_xgb.py index 782a5b8213..5b575a8f8e 100644 --- a/openfl/federated/task/runner_xgb.py +++ b/openfl/federated/task/runner_xgb.py @@ -26,12 +26,10 @@ def __init__(self, **kwargs): Attributes: global_model (xgb.Booster): The global XGBoost model. required_tensorkeys_for_function (dict): A dictionary to store required tensor keys for each function. - training_round_completed (bool): A flag to indicate if the training round is completed. """ super().__init__(**kwargs) self.global_model = None self.required_tensorkeys_for_function = {} - self.training_round_completed = False def rebuild_model(self, input_tensor_dict): """ @@ -73,7 +71,7 @@ def validate_task(self, col_name, round_num, input_tensor_dict, **kwargs): local_output_dict (dict): Tensors to maintain in the local TensorDB. """ - loader = self.data_loader.get_valid_dmatrix() + data = self.data_loader.get_valid_dmatrix() # during agg validation, self.bst will still be None. 
during local validation, it will have a value - no need to rebuild if self.bst is None: @@ -85,7 +83,7 @@ def validate_task(self, col_name, round_num, input_tensor_dict, **kwargs): # TODO: this is not robust, especially if using a loss metric metric = Metric(name="accuracy", value=np.array(0)) else: - metric = self.validate_(loader) + metric = self.validate_(data) origin = col_name suffix = "validate" @@ -95,7 +93,7 @@ def validate_task(self, col_name, round_num, input_tensor_dict, **kwargs): suffix += "_agg" tags = ("metric",) tags = change_tags(tags, add_field=suffix) - # TODO figure out a better way to pass in metric for this pytorch + # validate function output_tensor_dict = {TensorKey(metric.name, origin, round_num, True, tags): metric.value} @@ -125,8 +123,8 @@ def train_task( TensorDB. """ self.rebuild_model(input_tensor_dict) - loader = self.data_loader.get_train_dmatrix() - metric = self.train_(loader) + data = self.data_loader.get_train_dmatrix() + metric = self.train_(data) # Output metric tensors (scalar) origin = col_name tags = ("trained",) @@ -167,26 +165,6 @@ def train_task( **next_local_tensorkey_model_dict, } - # Update the required tensors if they need to be pulled from the - # aggregator - # TODO this logic can break if different collaborators have different - # roles between rounds. - # For example, if a collaborator only performs validation in the first - # round but training in the second, it has no way of knowing the - # optimizer state tensor names to request from the aggregator because - # these are only created after training occurs. A work around could - # involve doing a single epoch of training on random data to get the - # optimizer names, and then throwing away the model. - if self.opt_treatment == "CONTINUE_GLOBAL": - self.initialize_tensorkeys_for_functions() - - # This will signal that the optimizer values are now present, - # and can be loaded when the model is rebuilt - self.training_round_completed = True - - # Return global_tensor_dict, local_tensor_dict - # import pdb; pdb.set_trace() - # TODO it is still decodable from here with .tobytes().decode('utf-8') return global_tensor_dict, local_tensor_dict def get_tensor_dict(self, with_opt_vars=False): @@ -266,30 +244,17 @@ def initialize_tensorkeys_for_functions(self, with_opt_vars=False): Custom tensors should be added to this function. Args: - with_opt_vars (bool): Flag to check if optimizer variables are - included. Defaults to False. + with_opt_vars (bool): with_opt_vars (bool): N/A for XGBoost (Default=False). Returns: None """ - # TODO there should be a way to programmatically iterate through - # all of the methods in the class and declare the tensors. 
- # For now this is done manually - - output_model_dict = self.get_tensor_dict(with_opt_vars=with_opt_vars) + output_model_dict = self.get_tensor_dict() global_model_dict, local_model_dict = split_tensor_dict_for_holdouts( self.logger, output_model_dict, **self.tensor_dict_split_fn_kwargs ) - if not with_opt_vars: - global_model_dict_val = global_model_dict - local_model_dict_val = local_model_dict - else: - output_model_dict = self.get_tensor_dict(with_opt_vars=False) - global_model_dict_val, local_model_dict_val = split_tensor_dict_for_holdouts( - self.logger, - output_model_dict, - **self.tensor_dict_split_fn_kwargs, - ) + global_model_dict_val = global_model_dict + local_model_dict_val = local_model_dict self.required_tensorkeys_for_function["train_task"] = [ TensorKey(tensor_name, "GLOBAL", 0, False, ("model",)) @@ -354,7 +319,7 @@ def save_native( """ self.bst.save_model(filepath) - def train_(self, train_dataloader) -> Metric: + def train_(self, data) -> Metric: """ Train the XGBoost model. @@ -364,7 +329,7 @@ def train_(self, train_dataloader) -> Metric: Returns: Metric: A Metric object containing the training loss. """ - dtrain = train_dataloader["dmatrix"] + dtrain = data["dmatrix"] evals = [(dtrain, "train")] evals_result = {} @@ -381,7 +346,7 @@ def train_(self, train_dataloader) -> Metric: loss = evals_result["train"]["logloss"][-1] return Metric(name=self.loss_fn.__name__, value=np.array(loss)) - def validate_(self, validation_dataloader) -> Metric: + def validate_(self, data) -> Metric: """ Validate the XGBoost model. @@ -391,8 +356,8 @@ def validate_(self, validation_dataloader) -> Metric: Returns: Metric: A Metric object containing the validation accuracy. """ - dtest = validation_dataloader["dmatrix"] - y_test = validation_dataloader["labels"] + dtest = data["dmatrix"] + y_test = data["labels"] preds = self.bst.predict(dtest) y_pred_binary = np.where(preds > 0.5, 1, 0) acc = accuracy_score(y_test, y_pred_binary) From 4c03932c3ef4cc9c11716bf9821894015c182951 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Mon, 18 Nov 2024 10:24:13 -0800 Subject: [PATCH 30/40] remove conditional for unused condition Signed-off-by: kta-intel --- openfl/federated/task/runner_xgb.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/openfl/federated/task/runner_xgb.py b/openfl/federated/task/runner_xgb.py index 5b575a8f8e..27ea9486ad 100644 --- a/openfl/federated/task/runner_xgb.py +++ b/openfl/federated/task/runner_xgb.py @@ -36,23 +36,15 @@ def rebuild_model(self, input_tensor_dict): Rebuilds the model using the provided input tensor dictionary. This method checks if the 'local_tree' key in the input tensor dictionary is either a non-empty numpy array - or a non-None value. If this condition is met, it updates the internal tensor dictionary with the provided input. + If this condition is met, it updates the internal tensor dictionary with the provided input. Parameters: - input_tensor_dict (dict): A dictionary containing tensor data. It must include the key 'local_tree', which can be: - - A non-empty numpy array - - Any non-None value + input_tensor_dict (dict): A dictionary containing tensor data. 
It must include the key 'local_tree' Returns: None """ - if ( - isinstance(input_tensor_dict["local_tree"], np.ndarray) - and input_tensor_dict["local_tree"].size != 0 - ) or ( - not isinstance(input_tensor_dict["local_tree"], np.ndarray) - and input_tensor_dict["local_tree"] is not None - ): + if isinstance(input_tensor_dict["local_tree"], np.ndarray) and input_tensor_dict["local_tree"].size != 0 : self.set_tensor_dict(input_tensor_dict) def validate_task(self, col_name, round_num, input_tensor_dict, **kwargs): @@ -68,8 +60,7 @@ def validate_task(self, col_name, round_num, input_tensor_dict, **kwargs): Returns: global_output_dict (dict): Tensors to send back to the aggregator. - local_output_dict (dict): Tensors to maintain in the local - TensorDB. + local_output_dict (dict): Tensors to maintain in the local TensorDB. """ data = self.data_loader.get_valid_dmatrix() From 6aa983867a1b8ea6889a8febe8265a45f0383651 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Mon, 18 Nov 2024 11:44:37 -0800 Subject: [PATCH 31/40] add conversion check Signed-off-by: kta-intel --- openfl/federated/task/runner_xgb.py | 23 +++++++++++++++++++ .../aggregation_functions/fed_bagging.py | 11 +++++---- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/openfl/federated/task/runner_xgb.py b/openfl/federated/task/runner_xgb.py index 27ea9486ad..cda2563005 100644 --- a/openfl/federated/task/runner_xgb.py +++ b/openfl/federated/task/runner_xgb.py @@ -15,6 +15,27 @@ from openfl.utilities.split import split_tensor_dict_for_holdouts +def check_precision_loss(logger, converted_data, original_data): + """ + Checks for precision loss during conversion to float32 and back. + + Parameters: + logger (Logger): The logger object to log warnings. + converted_data (np.ndarray): The data that has been converted to float32. + original_data (list): The original data to be checked for precision loss. + """ + # Convert the float32 array back to bytes and decode to JSON + reconstructed_bytes = converted_data.astype(np.uint8).tobytes() + reconstructed_json = reconstructed_bytes.decode("utf-8") + reconstructed_data = json.loads(reconstructed_json) + + assert type(original_data) == type(reconstructed_data), "Reconstructed datatype does not match original." 
+ + # Compare the original and reconstructed data + if original_data != reconstructed_data: + logger.warn("Precision loss detected during conversion.") + + class XGBoostTaskRunner(TaskRunner): def __init__(self, **kwargs): """ @@ -209,6 +230,8 @@ def get_tensor_dict(self, with_opt_vars=False): np.float32 ) + check_precision_loss(self.logger, latest_trees_float32_array, original_data=latest_trees) + return {"local_tree": latest_trees_float32_array} def get_required_tensorkeys_for_function(self, func_name, **kwargs): diff --git a/openfl/interface/aggregation_functions/fed_bagging.py b/openfl/interface/aggregation_functions/fed_bagging.py index aaec86fb24..081fa91a2e 100644 --- a/openfl/interface/aggregation_functions/fed_bagging.py +++ b/openfl/interface/aggregation_functions/fed_bagging.py @@ -5,11 +5,11 @@ """Federated Boostrap Aggregation for XGBoost module.""" import json - +from logging import getLogger import numpy as np from openfl.interface.aggregation_functions.core import AggregationFunction - +from openfl.federated.task.runner_xgb import check_precision_loss def get_global_model(iterator, target_round): """ @@ -95,7 +95,7 @@ def call(self, local_tensors, db_iterator, tensor_name, fl_round, *_): Returns: bytearray: aggregated tensor """ - + logger = getLogger(__name__) global_model = get_global_model(db_iterator, fl_round) if ( @@ -127,4 +127,7 @@ def call(self, local_tensors, db_iterator, tensor_name, fl_round, *_): global_model_json = json.dumps(global_model) global_model_bytes = global_model_json.encode("utf-8") - return np.frombuffer(global_model_bytes, dtype=np.uint8).astype(np.float32) + global_model_float32_array = np.frombuffer(global_model_bytes, dtype=np.uint8).astype(np.float32) + check_precision_loss(logger, global_model_float32_array, global_model) + + return global_model_float32_array From ac2a9256dfe5eeb5491e59690a14f813781ae804 Mon Sep 17 00:00:00 2001 From: kta-intel Date: Mon, 18 Nov 2024 12:14:32 -0800 Subject: [PATCH 32/40] set global model attribute to np array for consistency Signed-off-by: kta-intel --- openfl/federated/task/runner_xgb.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/openfl/federated/task/runner_xgb.py b/openfl/federated/task/runner_xgb.py index cda2563005..f65b1039fc 100644 --- a/openfl/federated/task/runner_xgb.py +++ b/openfl/federated/task/runner_xgb.py @@ -208,7 +208,8 @@ def get_tensor_dict(self, with_opt_vars=False): booster_float32_array = np.frombuffer(booster_array, dtype=np.uint8).astype(np.float32) return {"local_tree": booster_float32_array} - global_model_booster_dict = json.loads(self.global_model) + global_model_byte_array = bytearray(self.global_model.astype(np.uint8).tobytes()) + global_model_booster_dict = json.loads(global_model_byte_array) num_global_trees = int( global_model_booster_dict["learner"]["gradient_booster"]["model"]["gbtree_model_param"][ "num_trees" @@ -313,9 +314,10 @@ def set_tensor_dict(self, tensor_dict, with_opt_vars=False): with_opt_vars (bool): N/A for XGBoost (Default=False). 
""" # The with_opt_vars argument is not used in this method - self.global_model = bytearray(tensor_dict["local_tree"].astype(np.uint8).tobytes()) + self.global_model = tensor_dict["local_tree"] + global_model_byte_array = bytearray(self.global_model.astype(np.uint8).tobytes()) self.bst = xgb.Booster() - self.bst.load_model(self.global_model) + self.bst.load_model(global_model_byte_array) def save_native( self, From d65def13c22736bd16cfdcfe167c936c5ad9353f Mon Sep 17 00:00:00 2001 From: kta-intel Date: Mon, 18 Nov 2024 12:29:07 -0800 Subject: [PATCH 33/40] raise value error when model is empty when trying to set tensor dict Signed-off-by: kta-intel --- openfl/federated/task/runner_xgb.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/openfl/federated/task/runner_xgb.py b/openfl/federated/task/runner_xgb.py index f65b1039fc..5169335614 100644 --- a/openfl/federated/task/runner_xgb.py +++ b/openfl/federated/task/runner_xgb.py @@ -315,9 +315,14 @@ def set_tensor_dict(self, tensor_dict, with_opt_vars=False): """ # The with_opt_vars argument is not used in this method self.global_model = tensor_dict["local_tree"] - global_model_byte_array = bytearray(self.global_model.astype(np.uint8).tobytes()) - self.bst = xgb.Booster() - self.bst.load_model(global_model_byte_array) + if ( + isinstance(self.global_model, np.ndarray) and self.global_model.size == 0 + ) or self.global_model is None: + raise ValueError("The model does not exist or is empty.") + else: + global_model_byte_array = bytearray(self.global_model.astype(np.uint8).tobytes()) + self.bst = xgb.Booster() + self.bst.load_model(global_model_byte_array) def save_native( self, From 63be8746588e49fbebf0abaa8ddb7d9d65e5e19c Mon Sep 17 00:00:00 2001 From: kta-intel Date: Mon, 18 Nov 2024 13:23:14 -0800 Subject: [PATCH 34/40] remove conversion checker to avoid circular import issue Signed-off-by: kta-intel --- openfl/interface/aggregation_functions/fed_bagging.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/openfl/interface/aggregation_functions/fed_bagging.py b/openfl/interface/aggregation_functions/fed_bagging.py index 081fa91a2e..4c05fd0829 100644 --- a/openfl/interface/aggregation_functions/fed_bagging.py +++ b/openfl/interface/aggregation_functions/fed_bagging.py @@ -9,7 +9,6 @@ import numpy as np from openfl.interface.aggregation_functions.core import AggregationFunction -from openfl.federated.task.runner_xgb import check_precision_loss def get_global_model(iterator, target_round): """ @@ -128,6 +127,5 @@ def call(self, local_tensors, db_iterator, tensor_name, fl_round, *_): global_model_bytes = global_model_json.encode("utf-8") global_model_float32_array = np.frombuffer(global_model_bytes, dtype=np.uint8).astype(np.float32) - check_precision_loss(logger, global_model_float32_array, global_model) return global_model_float32_array From b346b24b4c757840213a3b01fe1c898c60171cfd Mon Sep 17 00:00:00 2001 From: kta-intel Date: Mon, 18 Nov 2024 13:28:40 -0800 Subject: [PATCH 35/40] add docstring and more descriptive comments Signed-off-by: kta-intel --- .../interface/aggregation_functions/fed_bagging.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/openfl/interface/aggregation_functions/fed_bagging.py b/openfl/interface/aggregation_functions/fed_bagging.py index 4c05fd0829..a5eed3dd06 100644 --- a/openfl/interface/aggregation_functions/fed_bagging.py +++ b/openfl/interface/aggregation_functions/fed_bagging.py @@ -5,9 +5,7 @@ """Federated Boostrap Aggregation for XGBoost module.""" 
import json -from logging import getLogger import numpy as np - from openfl.interface.aggregation_functions.core import AggregationFunction def get_global_model(iterator, target_round): @@ -59,8 +57,13 @@ def append_trees(global_model, local_trees): class FedBaggingXGBoost(AggregationFunction): - """Federated Boostrap Aggregation for XGBoost.""" + """ + Federated Bootstrap Aggregation for XGBoost. + This class implements a federated learning aggregation function specifically + designed for XGBoost models. It aggregates local model updates (trees) from + multiple collaborators into a global model using a bagging approach. + """ def call(self, local_tensors, db_iterator, tensor_name, fl_round, *_): """Aggregate tensors. @@ -94,12 +97,12 @@ def call(self, local_tensors, db_iterator, tensor_name, fl_round, *_): Returns: bytearray: aggregated tensor """ - logger = getLogger(__name__) global_model = get_global_model(db_iterator, fl_round) if ( isinstance(global_model, np.ndarray) and global_model.size == 0 ) or global_model is None: + # if there is no global model, use the first local model as the global model for local_tensor in local_tensors: local_tree_bytearray = bytearray(local_tensor.tensor.astype(np.uint8).tobytes()) local_tree_json = json.loads(local_tree_bytearray) @@ -116,9 +119,11 @@ def call(self, local_tensors, db_iterator, tensor_name, fl_round, *_): global_model = append_trees(global_model, local_trees) else: global_model_bytearray = bytearray(global_model.astype(np.uint8).tobytes()) + # convert the global model to a dictionary global_model = json.loads(global_model_bytearray) for local_tensor in local_tensors: + # append trees to global model local_tree_bytearray = bytearray(local_tensor.tensor.astype(np.uint8).tobytes()) local_trees = json.loads(local_tree_bytearray) global_model = append_trees(global_model, local_trees) From 809b69b2c979f7019fb6600ace5305faaf1e9e9e Mon Sep 17 00:00:00 2001 From: kta-intel Date: Mon, 18 Nov 2024 13:29:57 -0800 Subject: [PATCH 36/40] formatting fix Signed-off-by: kta-intel --- openfl-workspace/xgb_higgs/requirements.txt | 2 +- openfl-workspace/xgb_higgs/src/dataloader.py | 4 ++-- openfl/federated/data/loader_xgb.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/openfl-workspace/xgb_higgs/requirements.txt b/openfl-workspace/xgb_higgs/requirements.txt index 797917eff2..26a78d72ac 100644 --- a/openfl-workspace/xgb_higgs/requirements.txt +++ b/openfl-workspace/xgb_higgs/requirements.txt @@ -1,3 +1,3 @@ +modin[all] scikit-learn xgboost -modin[all] diff --git a/openfl-workspace/xgb_higgs/src/dataloader.py b/openfl-workspace/xgb_higgs/src/dataloader.py index 47416cbefa..550ddcc47d 100644 --- a/openfl-workspace/xgb_higgs/src/dataloader.py +++ b/openfl-workspace/xgb_higgs/src/dataloader.py @@ -35,7 +35,7 @@ def load_Higgs(data_path, **kwargs): Load the Higgs dataset from CSV files. The dataset is expected to be in two CSV files: 'train.csv' and 'valid.csv'. - The first column in each file represents the labels, and the remaining + The first column in each file represents the labels, and the remaining columns represent the features. 
From 809b69b2c979f7019fb6600ace5305faaf1e9e9e Mon Sep 17 00:00:00 2001
From: kta-intel
Date: Mon, 18 Nov 2024 13:29:57 -0800
Subject: [PATCH 36/40] formatting fix

Signed-off-by: kta-intel
---
 openfl-workspace/xgb_higgs/requirements.txt  | 2 +-
 openfl-workspace/xgb_higgs/src/dataloader.py | 4 ++--
 openfl/federated/data/loader_xgb.py          | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/openfl-workspace/xgb_higgs/requirements.txt b/openfl-workspace/xgb_higgs/requirements.txt
index 797917eff2..26a78d72ac 100644
--- a/openfl-workspace/xgb_higgs/requirements.txt
+++ b/openfl-workspace/xgb_higgs/requirements.txt
@@ -1,3 +1,3 @@
+modin[all]
 scikit-learn
 xgboost
-modin[all]

diff --git a/openfl-workspace/xgb_higgs/src/dataloader.py b/openfl-workspace/xgb_higgs/src/dataloader.py
index 47416cbefa..550ddcc47d 100644
--- a/openfl-workspace/xgb_higgs/src/dataloader.py
+++ b/openfl-workspace/xgb_higgs/src/dataloader.py
@@ -35,7 +35,7 @@ def load_Higgs(data_path, **kwargs):
     Load the Higgs dataset from CSV files.

     The dataset is expected to be in two CSV files: 'train.csv' and 'valid.csv'.
-    The first column in each file represents the labels, and the remaining 
+    The first column in each file represents the labels, and the remaining
     columns represent the features.

     Args:
@@ -57,4 +57,4 @@ def load_Higgs(data_path, **kwargs):
     X_valid = valid_data.iloc[:, 1:].values
     y_valid = valid_data.iloc[:, 0].values

-    return X_train, y_train, X_valid, y_valid
\ No newline at end of file
+    return X_train, y_train, X_valid, y_valid

diff --git a/openfl/federated/data/loader_xgb.py b/openfl/federated/data/loader_xgb.py
index cb8272af98..ad12758a32 100644
--- a/openfl/federated/data/loader_xgb.py
+++ b/openfl/federated/data/loader_xgb.py
@@ -1,4 +1,4 @@
-from math import ceil
+pass

 import numpy as np
 import xgboost as xgb
@@ -60,7 +60,7 @@ def get_valid_data_size(self):
             int: The total number of validation samples.
         """
         return self.X_valid.shape[0]
-    
+
     def get_dmatrix(self, X, y):
         """Returns the DMatrix for the given data.

From cf67f62a65a2b9066120d99e667bf1d175e81571 Mon Sep 17 00:00:00 2001
From: kta-intel
Date: Mon, 18 Nov 2024 13:41:55 -0800
Subject: [PATCH 37/40] fixing import sorting

Signed-off-by: kta-intel
---
 openfl/federated/data/loader_xgb.py                   | 4 +---
 openfl/interface/aggregation_functions/fed_bagging.py | 3 +++
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/openfl/federated/data/loader_xgb.py b/openfl/federated/data/loader_xgb.py
index ad12758a32..46087392b4 100644
--- a/openfl/federated/data/loader_xgb.py
+++ b/openfl/federated/data/loader_xgb.py
@@ -1,7 +1,5 @@
-pass
-
-import numpy as np
 import xgboost as xgb
+
 from openfl.federated.data.loader import DataLoader

diff --git a/openfl/interface/aggregation_functions/fed_bagging.py b/openfl/interface/aggregation_functions/fed_bagging.py
index a5eed3dd06..5bd75fbcdb 100644
--- a/openfl/interface/aggregation_functions/fed_bagging.py
+++ b/openfl/interface/aggregation_functions/fed_bagging.py
@@ -5,9 +5,12 @@
 """Federated Boostrap Aggregation for XGBoost module."""

 import json
+
 import numpy as np
+
 from openfl.interface.aggregation_functions.core import AggregationFunction

+
 def get_global_model(iterator, target_round):
     """
     Retrieves the global model for the specific round from an iterator.
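PATCH 36 and 37 are mechanical cleanups, but the loader context they touch shows the dataloader's core contract: numpy feature and label arrays in, xgb.DMatrix out. A minimal sketch of such a helper, with random arrays standing in for the Higgs splits that load_Higgs returns:

    import numpy as np
    import xgboost as xgb

    def get_dmatrix(X: np.ndarray, y: np.ndarray) -> xgb.DMatrix:
        # DMatrix is XGBoost's optimized internal container; building it once
        # per split avoids repeated conversions across train/validate tasks.
        return xgb.DMatrix(X, label=y)

    dtrain = get_dmatrix(np.random.rand(8, 28).astype(np.float32),
                         np.random.randint(0, 2, size=8))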
From acb89d56cb9a9925101e00de00205ba43829b533 Mon Sep 17 00:00:00 2001
From: kta-intel
Date: Mon, 18 Nov 2024 13:47:38 -0800
Subject: [PATCH 38/40] format fix

Signed-off-by: kta-intel
---
 openfl/federated/task/runner_xgb.py                   | 9 +++++++--
 openfl/interface/aggregation_functions/fed_bagging.py | 5 ++++-
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/openfl/federated/task/runner_xgb.py b/openfl/federated/task/runner_xgb.py
index 5169335614..a1da3e8e5c 100644
--- a/openfl/federated/task/runner_xgb.py
+++ b/openfl/federated/task/runner_xgb.py
@@ -29,7 +29,9 @@ def check_precision_loss(logger, converted_data, original_data):
     reconstructed_json = reconstructed_bytes.decode("utf-8")
     reconstructed_data = json.loads(reconstructed_json)

-    assert type(original_data) == type(reconstructed_data), "Reconstructed datatype does not match original."
+    assert type(original_data) == type(
+        reconstructed_data
+    ), "Reconstructed datatype does not match original."

     # Compare the original and reconstructed data
     if original_data != reconstructed_data:
@@ -65,7 +67,10 @@ def rebuild_model(self, input_tensor_dict):
         Returns:
             None
         """
-        if isinstance(input_tensor_dict["local_tree"], np.ndarray) and input_tensor_dict["local_tree"].size != 0 :
+        if (
+            isinstance(input_tensor_dict["local_tree"], np.ndarray)
+            and input_tensor_dict["local_tree"].size != 0
+        ):
             self.set_tensor_dict(input_tensor_dict)

     def validate_task(self, col_name, round_num, input_tensor_dict, **kwargs):

diff --git a/openfl/interface/aggregation_functions/fed_bagging.py b/openfl/interface/aggregation_functions/fed_bagging.py
index 5bd75fbcdb..2e42072c66 100644
--- a/openfl/interface/aggregation_functions/fed_bagging.py
+++ b/openfl/interface/aggregation_functions/fed_bagging.py
@@ -67,6 +67,7 @@ class FedBaggingXGBoost(AggregationFunction):
     designed for XGBoost models. It aggregates local model updates (trees) from
     multiple collaborators into a global model using a bagging approach.
     """
+
     def call(self, local_tensors, db_iterator, tensor_name, fl_round, *_):
         """Aggregate tensors.

@@ -134,6 +135,8 @@ def call(self, local_tensors, db_iterator, tensor_name, fl_round, *_):

         global_model_json = json.dumps(global_model)
         global_model_bytes = global_model_json.encode("utf-8")
-        global_model_float32_array = np.frombuffer(global_model_bytes, dtype=np.uint8).astype(np.float32)
+        global_model_float32_array = np.frombuffer(global_model_bytes, dtype=np.uint8).astype(
+            np.float32
+        )

         return global_model_float32_array

From 5794b703eea27bb8d09cbb4bbfc603f27e9f762d Mon Sep 17 00:00:00 2001
From: kta-intel
Date: Mon, 18 Nov 2024 13:48:06 -0800
Subject: [PATCH 39/40] remove unnecessary files

Signed-off-by: kta-intel
---
 openfl-workspace/xgb_higgs/.workspace    | 1 -
 openfl-workspace/xgb_higgs/plan/defaults | 1 -
 2 files changed, 2 deletions(-)
 delete mode 100644 openfl-workspace/xgb_higgs/.workspace
 delete mode 100644 openfl-workspace/xgb_higgs/plan/defaults

diff --git a/openfl-workspace/xgb_higgs/.workspace b/openfl-workspace/xgb_higgs/.workspace
deleted file mode 100644
index 520b5e57c1..0000000000
--- a/openfl-workspace/xgb_higgs/.workspace
+++ /dev/null
@@ -1 +0,0 @@
-current_plan_name: default

diff --git a/openfl-workspace/xgb_higgs/plan/defaults b/openfl-workspace/xgb_higgs/plan/defaults
deleted file mode 100644
index 5042bedbcf..0000000000
--- a/openfl-workspace/xgb_higgs/plan/defaults
+++ /dev/null
@@ -1 +0,0 @@
-../../workspace/plan/defaults
\ No newline at end of file

From 34f7d8abcf725ae8d6643363439283138fbf1fa4 Mon Sep 17 00:00:00 2001
From: kta-intel
Date: Mon, 18 Nov 2024 13:57:21 -0800
Subject: [PATCH 40/40] format fix, comparing datatype

Signed-off-by: kta-intel
---
 openfl/federated/task/runner_xgb.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/openfl/federated/task/runner_xgb.py b/openfl/federated/task/runner_xgb.py
index a1da3e8e5c..222b8c613f 100644
--- a/openfl/federated/task/runner_xgb.py
+++ b/openfl/federated/task/runner_xgb.py
@@ -29,7 +29,7 @@ def check_precision_loss(logger, converted_data, original_data):
     reconstructed_json = reconstructed_bytes.decode("utf-8")
     reconstructed_data = json.loads(reconstructed_json)

-    assert type(original_data) == type(
+    assert type(original_data) is type(
         reconstructed_data
     ), "Reconstructed datatype does not match original."
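PATCH 38 and 40 only reformat check_precision_loss, but its premise deserves a note: shipping the JSON model as a float32 tensor is lossless because every uint8 value (0 through 255) is exactly representable in float32, so the bytes survive the round trip. A self-contained demonstration, with a toy payload standing in for a real model; the final assertion uses the identity comparison that PATCH 40 adopts to satisfy flake8's E721:

    import json

    import numpy as np

    payload = json.dumps({"gbtree_model_param": {"num_trees": "4"}}).encode("utf-8")
    as_float32 = np.frombuffer(payload, dtype=np.uint8).astype(np.float32)

    # float32 -> uint8 -> bytes reconstructs the original JSON exactly.
    recovered = as_float32.astype(np.uint8).tobytes()
    assert recovered == payload

    reconstructed_data = json.loads(recovered.decode("utf-8"))
    assert type(reconstructed_data) is type(json.loads(payload.decode("utf-8")))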