
Enable federated XGBoost using bootstrap aggregation in Task Runner #1151

Merged (42 commits, Nov 20, 2024)

Changes shown are from 22 of the 42 commits.

Commits:
33d304f  initial xgboost workspace commit (kta-intel, Nov 8, 2024)
93dc8b4  updating taskrunner and aggregation function (kta-intel, Nov 8, 2024)
52fea84  runner updates (kta-intel, Nov 12, 2024)
1275fd6  logic for loader (kta-intel, Nov 12, 2024)
49f5cdf  enabling work (kta-intel, Nov 13, 2024)
ddece36  further enabling work (kta-intel, Nov 14, 2024)
c7e2d76  fix first round local validation (kta-intel, Nov 14, 2024)
9d385a7  remove need to convert to float64 (kta-intel, Nov 14, 2024)
ce4b34f  fix model save (kta-intel, Nov 15, 2024)
70e4171  remove set_trace and fix spacing (kta-intel, Nov 15, 2024)
3d2df78  rename workspace and fix plan (kta-intel, Nov 15, 2024)
54cdc5e  fix lint (kta-intel, Nov 15, 2024)
51a0afa  more formatting fixes (kta-intel, Nov 15, 2024)
d3937ef  revert space removal (kta-intel, Nov 15, 2024)
dd2027c  Revert "revert space removal" (kta-intel, Nov 15, 2024)
e008e4a  revert changes on interface.plan (kta-intel, Nov 15, 2024)
3cbd5e5  remove from history. unchanged (kta-intel, Nov 15, 2024)
051d8fc  reverting back to fresh state for interface.plan (kta-intel, Nov 15, 2024)
58172c1  Merge branch 'securefederatedai:develop' into xgboost-fedbagging (kta-intel, Nov 15, 2024)
a8d9b59  move delta_updates below assigner in args (kta-intel, Nov 15, 2024)
5f1d909  add delta_update default to True, remove from yaml (kta-intel, Nov 15, 2024)
3670bd0  enable modin pandas (kta-intel, Nov 16, 2024)
dcfdd70  add DO NOT EDIT notice (kta-intel, Nov 18, 2024)
bd03eac  added docstrings (kta-intel, Nov 18, 2024)
326069d  set DEFAULT_PATH to cwd (kta-intel, Nov 18, 2024)
8a75cc5  fix docstrings and remove commented out lines (kta-intel, Nov 18, 2024)
450d8c3  change to use_delta_updates for readibility (kta-intel, Nov 18, 2024)
eecffe0  split test data for collaborators (kta-intel, Nov 18, 2024)
238448f  clean up methods (kta-intel, Nov 18, 2024)
16cd7e1  clean up taskrunner (kta-intel, Nov 18, 2024)
4c03932  remove conditional for unused condition (kta-intel, Nov 18, 2024)
6aa9838  add conversion check (kta-intel, Nov 18, 2024)
ac2a925  set global model attribute to np array for consistency (kta-intel, Nov 18, 2024)
d65def1  raise value error when model is empty when trying to set tensor dict (kta-intel, Nov 18, 2024)
63be874  remove conversion checker to avoid circular import issue (kta-intel, Nov 18, 2024)
b346b24  add docstring and more descriptive comments (kta-intel, Nov 18, 2024)
809b69b  formatting fix (kta-intel, Nov 18, 2024)
cf67f62  fixing import sorting (kta-intel, Nov 18, 2024)
acb89d5  format fix (kta-intel, Nov 18, 2024)
5794b70  remove unnecessarly files (kta-intel, Nov 18, 2024)
34f7d8a  format fix, comparing datatype (kta-intel, Nov 18, 2024)
837031b  Merge branch 'securefederatedai:develop' into xgboost-fedbagging (kta-intel, Nov 19, 2024)
4 changes: 2 additions & 2 deletions openfl-workspace/workspace/plan/defaults/aggregator.yaml
@@ -1,4 +1,4 @@
 template : openfl.component.Aggregator
 settings :
-  db_store_rounds : 2
-  write_logs : true
+  db_store_rounds : 2
+  write_logs : true

(The removed and re-added lines appear to differ only in whitespace.)
21 changes: 21 additions & 0 deletions openfl-workspace/workspace/plan/defaults/tasks_xgb.yaml
Collaborator:
General question: have you used specific formatters for the yaml files?

Collaborator (author):
I copied over the yaml from other workspaces as a template, then ran bash shell/format.sh on the whole repo. Is there something additional that you recommend?

@@ -0,0 +1,21 @@
aggregated_model_validation:
  function : validate_task
  kwargs :
    apply : global
    metrics :
      - acc

locally_tuned_model_validation:
  function : validate_task
  kwargs :
    apply : local
    metrics :
      - acc

train:
  function : train_task
  kwargs :
    metrics :
      - loss
  aggregation_type :
    template : openfl.interface.aggregation_functions.FedBaggingXGBoost
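
Aside: the aggregation_type override above is what swaps the default weighted averaging for bootstrap aggregation on the train task. As a rough conceptual sketch (not the actual FedBaggingXGBoost implementation, which operates on serialized boosters), bagging for gradient-boosted trees amounts to taking the union of the trees each collaborator boosted locally in a round:

# Conceptual sketch only; the function name and data shapes are illustrative.
def bagging_aggregate(global_trees, local_tree_lists):
    """Append every collaborator's newly boosted trees to the global ensemble."""
    for local_trees in local_tree_lists:
        global_trees.extend(local_trees)
    return global_trees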
1 change: 1 addition & 0 deletions openfl-workspace/xgb_higgs/.workspace
@@ -0,0 +1 @@
current_plan_name: default
5 changes: 5 additions & 0 deletions openfl-workspace/xgb_higgs/plan/cols.yaml
@@ -0,0 +1,5 @@
# Copyright (C) 2024 Intel Corporation
# Licensed subject to the terms of the separately executed evaluation license agreement between Intel Corporation and you.

# This file lists the collaborators associated with the federation. The list will be auto-populated during collaborator creation.
collaborators:
5 changes: 5 additions & 0 deletions openfl-workspace/xgb_higgs/plan/data.yaml
@@ -0,0 +1,5 @@
# Copyright (C) 2024 Intel Corporation
# Licensed subject to the terms of the separately executed evaluation license agreement between Intel Corporation and you.

# This file specifies the local data directory associated with the respective collaborator. This will be auto-populated during collaborator creation
# collaborator_name,data_directory_path
1 change: 1 addition & 0 deletions openfl-workspace/xgb_higgs/plan/defaults
@@ -0,0 +1 @@
../../workspace/plan/defaults
51 changes: 51 additions & 0 deletions openfl-workspace/xgb_higgs/plan/plan.yaml
@@ -0,0 +1,51 @@
# Copyright (C) 2024 Intel Corporation
# Licensed subject to the terms of the separately executed evaluation license agreement between Intel Corporation and you.

aggregator :
  defaults : plan/defaults/aggregator.yaml
  template : openfl.component.aggregator.Aggregator
  settings :
    init_state_path : save/init.pbuf
    best_state_path : save/best.pbuf
    last_state_path : save/last.pbuf
    rounds_to_train : 10
    write_logs : false
    delta_updates : false

collaborator :
  defaults : plan/defaults/collaborator.yaml
  template : openfl.component.collaborator.Collaborator
  settings :
    delta_updates : false
    opt_treatment : RESET

data_loader :
  defaults : plan/defaults/data_loader.yaml
  template : src.dataloader.HiggsDataLoader
  settings :
    input_shape : 28

task_runner :
  defaults : plan/defaults/task_runner.yaml
  template : src.taskrunner.XGBoostRunner
  settings :
    params :
      objective: binary:logistic
      eval_metric: logloss
      max_depth: 6
      eta: 0.3
      num_parallel_tree: 1

network :
  defaults : plan/defaults/network.yaml
  settings :
    {}

assigner :
  defaults : plan/defaults/assigner.yaml

tasks :
  defaults : plan/defaults/tasks_xgb.yaml

compression_pipeline :
  defaults : plan/defaults/compression_pipeline.yaml
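
Note: delta_updates is set to false for both the aggregator and the collaborator; serialized XGBoost boosters are not additive parameter arrays, so the delta mechanism is bypassed (see the aggregator change further down). As a usage sketch, assuming the standard OpenFL Task Runner workflow: fx workspace create --prefix <dir> --template xgb_higgs to instantiate this workspace, then fx plan initialize once the data splits produced by src/setup_data.py are in place.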
3 changes: 3 additions & 0 deletions openfl-workspace/xgb_higgs/requirements.txt
@@ -0,0 +1,3 @@
scikit-learn
xgboost
modin[all]
2 changes: 2 additions & 0 deletions openfl-workspace/xgb_higgs/src/__init__.py
@@ -0,0 +1,2 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
30 changes: 30 additions & 0 deletions openfl-workspace/xgb_higgs/src/dataloader.py
@@ -0,0 +1,30 @@
# Copyright (C) 2024 Intel Corporation
# Licensed subject to the terms of the separately executed evaluation license agreement between
# Intel Corporation and you.

from openfl.federated import XGBoostDataLoader
import os
import modin.pandas as pd


class HiggsDataLoader(XGBoostDataLoader):
    def __init__(self, data_path, **kwargs):
        super().__init__(**kwargs)
        X_train, y_train, X_valid, y_valid = load_Higgs(
            data_path, **kwargs
        )
        self.X_train = X_train
        self.y_train = y_train
        self.X_valid = X_valid
        self.y_valid = y_valid


def load_Higgs(data_path, **kwargs):
    train_data = pd.read_csv(os.path.join(data_path, 'train.csv'), header=None)
    X_train = train_data.iloc[:, 1:].values
    y_train = train_data.iloc[:, 0].values

    valid_data = pd.read_csv(os.path.join(data_path, 'valid.csv'), header=None)
    X_valid = valid_data.iloc[:, 1:].values
    y_valid = valid_data.iloc[:, 0].values

    return X_train, y_train, X_valid, y_valid
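
Aside: the task runner below consumes an xgb.DMatrix plus a label array (it reads loader['dmatrix'] and loader['labels']), so the numpy arrays returned by load_Higgs are presumably wrapped by the base XGBoostDataLoader. A minimal sketch of that conversion, with made-up data standing in for the Higgs CSVs:

import numpy as np
import xgboost as xgb

# Hypothetical stand-in data: 28 features per row, matching input_shape in plan.yaml.
X_train = np.random.rand(100, 28)
y_train = np.random.randint(0, 2, size=100)

# xgb.DMatrix is the structure that train_/validate_ in taskrunner.py expect.
dtrain = xgb.DMatrix(X_train, label=y_train)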
94 changes: 94 additions & 0 deletions openfl-workspace/xgb_higgs/src/setup_data.py
@@ -0,0 +1,94 @@
import sys
import os
import shutil
from logging import getLogger
from urllib.request import urlretrieve
from hashlib import sha384
from os import path, makedirs
from tqdm import tqdm
import modin.pandas as pd
import gzip
from sklearn.model_selection import train_test_split
import numpy as np

logger = getLogger(__name__)

"""HIGGS Dataset."""

URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz"
FILENAME = "HIGGS.csv.gz"
CSV_FILENAME = "HIGGS.csv"
CSV_SHA384 = 'b8b82e11a78b81601381420878ad42ba557291f394a88dc5293e4077c8363c87429639b120e299a2a9939c1f943b6a63'
DEFAULT_PATH = path.join(path.expanduser('~'), '.openfl', 'data')

pbar = tqdm(total=None)


def report_hook(count, block_size, total_size):
    """Update progressbar."""
    if pbar.total is None and total_size:
        pbar.total = total_size
    progress_bytes = count * block_size
    pbar.update(progress_bytes - pbar.n)


def verify_sha384(file_path, expected_hash):
    """Verify the SHA-384 hash of a file."""
    sha384_hash = sha384()
    with open(file_path, 'rb') as f:
        for byte_block in iter(lambda: f.read(4096), b""):
            sha384_hash.update(byte_block)
    computed_hash = sha384_hash.hexdigest()
    if computed_hash != expected_hash:
        raise ValueError(f"SHA-384 hash mismatch: expected {expected_hash}, got {computed_hash}")
    print(f"SHA-384 hash verified: {computed_hash}")


def setup_data(root: str = DEFAULT_PATH, **kwargs):
    """Initialize."""
    makedirs(root, exist_ok=True)
    filepath = path.join(root, FILENAME)
    csv_filepath = path.join(root, CSV_FILENAME)
    if not path.exists(filepath):
        urlretrieve(URL, filepath, report_hook)  # nosec
        verify_sha384(filepath, CSV_SHA384)
    # Extract the CSV file from the gzip file
    with gzip.open(filepath, 'rb') as f_in:
        with open(csv_filepath, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)


def main():
    if len(sys.argv) < 2:
        raise ValueError("Provide the number of collaborators")
    src = 'higgs_data'
    if os.path.exists(src):
        shutil.rmtree(src)
    setup_data(src)
    collaborators = int(sys.argv[1])
    print("Creating splits for {} collaborators".format(collaborators))

    # Load the dataset
    higgs_data = pd.read_csv(path.join(src, CSV_FILENAME), header=None)

    # Split the dataset into features and labels
    X = higgs_data.iloc[:, 1:].values
    y = higgs_data.iloc[:, 0].values

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Combine X and y for train and test sets
    train_data = pd.DataFrame(data=np.column_stack((y_train, X_train)))
    test_data = pd.DataFrame(data=np.column_stack((y_test, X_test)))

    # Split the training data into parts for each collaborator
    for i in range(collaborators):
        dst = f'data/{i+1}'
        makedirs(dst, exist_ok=True)

        # Split the training data for the current collaborator
        split_train_data = train_data.iloc[i::collaborators]
        split_train_data.to_csv(path.join(dst, 'train.csv'), index=False, header=False)

        # Copy the test data for the current collaborator
        test_data.to_csv(path.join(dst, 'valid.csv'), index=False, header=False)


if __name__ == '__main__':
    main()
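
Usage note: running python src/setup_data.py 2 downloads HIGGS.csv.gz, verifies its SHA-384 hash, extracts the CSV, and writes data/1/train.csv, data/1/valid.csv, data/2/train.csv, and data/2/valid.csv. With N collaborators, collaborator i receives every N-th row of the training split (train_data.iloc[i::collaborators]); at this revision the full validation split is copied to each collaborator.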
56 changes: 56 additions & 0 deletions openfl-workspace/xgb_higgs/src/taskrunner.py
@@ -0,0 +1,56 @@
# Copyright (C) 2024 Intel Corporation
# Licensed subject to the terms of the separately executed evaluation license agreement between
# Intel Corporation and you.

"""You may copy this file as the starting point of your own model."""
import numpy as np
import xgboost as xgb

from openfl.federated import XGBoostTaskRunner
from openfl.utilities import Metric
from sklearn.metrics import accuracy_score


class XGBoostRunner(XGBoostTaskRunner):
    """
    Federated XGBoost task runner for binary classification.

    XGBoostTaskRunner exchanges the serialized booster with the aggregator,
    so the model is configured via the params dict in plan.yaml rather than
    defined layer-by-layer as in the PyTorch runners.
    """

    def __init__(self, params=None, num_rounds=1, **kwargs):
        """Initialize.

        Args:
            params: xgboost training parameters (objective, eval_metric, etc.)
            num_rounds: number of boosting rounds per federated round
            **kwargs: Additional arguments to pass to the function
        """
        super().__init__(**kwargs)

        self.bst = None
        self.params = params
        self.num_rounds = num_rounds

    def train_(self, train_dataloader) -> Metric:
        """Train model."""
        dtrain = train_dataloader['dmatrix']
        evals = [(dtrain, 'train')]
        evals_result = {}

        self.bst = xgb.train(self.params, dtrain, self.num_rounds, xgb_model=self.bst,
                             evals=evals, evals_result=evals_result, verbose_eval=False)

        loss = evals_result['train']['logloss'][-1]
        return Metric(name=self.params['eval_metric'], value=np.array(loss))

    def validate_(self, validation_dataloader) -> Metric:
        """Validate model."""
        dtest = validation_dataloader['dmatrix']
        y_test = validation_dataloader['labels']
        preds = self.bst.predict(dtest)
        y_pred_binary = np.where(preds > 0.5, 1, 0)
        acc = accuracy_score(y_test, y_pred_binary)

        return Metric(name="accuracy", value=np.array(acc))
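
Aside: because xgb.train receives xgb_model=self.bst, each federated round continues boosting on top of the current global ensemble instead of training from scratch. For the ensemble to travel between collaborator and aggregator as a tensor, the booster has to round-trip through a byte buffer; a minimal sketch of that round-trip (illustrative only, not the exact code in the base XGBoostTaskRunner):

import numpy as np
import xgboost as xgb

# Train a tiny booster on random data just to have something to serialize.
X = np.random.rand(64, 28)
y = np.random.randint(0, 2, size=64)
bst = xgb.train({"objective": "binary:logistic"}, xgb.DMatrix(X, label=y),
                num_boost_round=1)

# Booster -> numpy uint8 array (the commit "set global model attribute to
# np array for consistency" hints at a representation like this) and back.
raw = np.frombuffer(bytes(bst.save_raw(raw_format="json")), dtype=np.uint8)
restored = xgb.Booster()
restored.load_model(bytearray(raw.tobytes()))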
7 changes: 5 additions & 2 deletions openfl/component/aggregator/aggregator.py
@@ -69,6 +69,7 @@ def __init__(
         best_state_path,
         last_state_path,
         assigner,
+        delta_updates=True,
         straggler_handling_policy=None,
         rounds_to_train=256,
         single_col_cert_common_name=None,
@@ -186,6 +187,8 @@ def __init__(
         # Initialize a lock for thread safety
         self.lock = Lock()
 
+        self.delta_updates = delta_updates
+
     def _load_initial_tensors(self):
         """Load all of the tensors required to begin federated learning.
 
@@ -801,7 +804,7 @@ def _prepare_trained(self, tensor_name, origin, round_number, report, agg_result
         # Create delta and save it in TensorDB
         base_model_tk = TensorKey(tensor_name, origin, round_number, report, ("model",))
         base_model_nparray = self.tensor_db.get_tensor_from_cache(base_model_tk)
-        if base_model_nparray is not None:
+        if base_model_nparray is not None and self.delta_updates:
             delta_tk, delta_nparray = self.tensor_codec.generate_delta(
                 agg_tag_tk, agg_results, base_model_nparray
             )
@@ -830,7 +833,7 @@ def _prepare_trained(self, tensor_name, origin, round_number, report, agg_result
         self.tensor_db.cache_tensor({decompressed_delta_tk: decompressed_delta_nparray})
 
         # Apply delta (unless delta couldn't be created)
-        if base_model_nparray is not None:
+        if base_model_nparray is not None and self.delta_updates:
             self.logger.debug("Applying delta for layer %s", decompressed_delta_tk[0])
             new_model_tk, new_model_nparray = self.tensor_codec.apply_delta(
                 decompressed_delta_tk,
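
Aside: the new delta_updates flag gates both halves of the delta path: computing a delta against the cached base model and applying it to produce the new model. For array-valued models the mechanism is conceptually elementwise subtraction and addition (a simplification of tensor_codec.generate_delta / apply_delta, which also manage TensorKey tags and compression). Serialized boosters are opaque, variable-length byte buffers, so elementwise deltas are not meaningful for them, which is presumably why the xgb_higgs plan sets delta_updates to false:

import numpy as np

# Simplified view of delta updates on an array-valued model.
base_model = np.array([0.10, 0.20, 0.30])
agg_result = np.array([0.15, 0.25, 0.35])

delta = agg_result - base_model   # roughly what generate_delta produces
new_model = base_model + delta    # roughly what apply_delta reconstructs
assert np.allclose(new_model, agg_result)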
5 changes: 5 additions & 0 deletions openfl/federated/__init__.py
@@ -20,6 +20,11 @@
     from openfl.federated.data import PyTorchDataLoader
     from openfl.federated.task import FederatedModel  # NOQA
     from openfl.federated.task import PyTorchTaskRunner
+if importlib.util.find_spec("xgboost") is not None:
+    from openfl.federated.data import FederatedDataSet  # NOQA
+    from openfl.federated.data import XGBoostDataLoader
+    from openfl.federated.task import FederatedModel  # NOQA
+    from openfl.federated.task import XGBoostTaskRunner
 
 __all__ = [
     "Plan",
4 changes: 4 additions & 0 deletions openfl/federated/data/__init__.py
@@ -23,3 +23,7 @@
 if importlib.util.find_spec("torch") is not None:
     from openfl.federated.data.federated_data import FederatedDataSet  # NOQA
     from openfl.federated.data.loader_pt import PyTorchDataLoader  # NOQA
+
+if importlib.util.find_spec("xgboost") is not None:
+    from openfl.federated.data.federated_data import FederatedDataSet  # NOQA
+    from openfl.federated.data.loader_xgb import XGBoostDataLoader  # NOQA