Skip to content

Commit

Permalink
Dev/mip 802/svm impl scikit (#442)
Browse files Browse the repository at this point in the history
* Adds test and expected for SVM

* Adds SVM algorithm and specs

* Changes name from mipengine to exareme2 in dependencies in svm algo

* Adds svm tests to prod env
  • Loading branch information
mollyk authored Sep 19, 2023
1 parent 083cd22 commit 5cc7366
Show file tree
Hide file tree
Showing 7 changed files with 1,230 additions and 0 deletions.
1 change: 1 addition & 0 deletions exareme2/algorithms/specifications.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ class AlgorithmName(str, Enum):
NAIVE_BAYES_GAUSSIAN_CV = "naive_bayes_gaussian_cv"
PCA = "pca"
PEARSON_CORRELATION = "pearson_correlation"
SVM_SCIKIT = "svm_scikit"
TTEST_INDEPENDENT = "ttest_independent"
TTEST_ONESAMPLE = "ttest_onesample"
TTEST_PAIRED = "ttest_paired"
Expand Down
61 changes: 61 additions & 0 deletions exareme2/algorithms/svm_scikit.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
{
"name": "svm_scikit",
"desc": "Divide datasets into classes by finding a maximum-margin hyperplane.",
"label": "SVM",
"enabled": true,
"inputdata": {
"y": {
"label": "Classes",
"desc": "Classes of x.",
"types": [
"text",
"int",
"real"
],
"stattypes": [
"nominal"
],
"notblank": true,
"multiple": true
},
"x": {
"label": "Data points",
"desc": "Data points (feature vectors) to be divided into classes.",
"types": [
"real",
"int"
],
"stattypes": [
"numerical"
],
"notblank": true,
"multiple": true
}
},
"parameters": {
"gamma": {
"label": "Gamma",
"desc": "Gamma parameter of the RBF kernel. It controls how far the influence of a single training point reaches.",
"types": [
"real"
],
"notblank": true,
"multiple": false,
"default": 0.1,
"min": 0.0,
"max": 1.0
},
"C": {
"label": "C",
"desc": "C regularization parameter used to set the tolerance of the model to allow the misclassification of data points in order to achieve lower generalization error. The C value controls the penalty of misclassification.",
"types": [
"real"
],
"notblank": true,
"multiple": false,
"default": 1.0,
"min": 0.0,
"max": 1.0
}
}
}
140 changes: 140 additions & 0 deletions exareme2/algorithms/svm_scikit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
from typing import List
from typing import TypeVar

from pydantic import BaseModel

from exareme2.algorithms.algorithm import Algorithm
from exareme2.algorithms.algorithm import AlgorithmDataLoader
from exareme2.algorithms.fedaverage import fed_average
from exareme2.algorithms.helpers import get_transfer_data
from exareme2.algorithms.helpers import sum_secure_transfers
from exareme2.exceptions import BadUserInput
from exareme2.udfgen import literal
from exareme2.udfgen import relation
from exareme2.udfgen import secure_transfer
from exareme2.udfgen import udf

# Name under which the data loader and the algorithm class below are registered
# (passed as ``algname=``); it is the same string as the "name" field of
# svm_scikit.json.
ALGORITHM_NAME = "svm_scikit"


class SVMDataLoader(AlgorithmDataLoader, algname=ALGORITHM_NAME):
    """Data loader registered for the SVM algorithm."""

    def get_variable_groups(self):
        """Return the variable groups to load: the ``x`` group, then the ``y`` group."""
        variables = self._variables
        return [variables.x, variables.y]


class SVMResult(BaseModel):
    # Result payload built at the end of SVMAlgorithm.run.

    # Title of the result; run() sets it to "SVM Result".
    title: str
    # Number of training observations, taken from the fitted model's nobs_train.
    n_obs: int
    # Coefficients taken from the fitted model's ``coeff`` attribute.
    # NOTE(review): the local fit wraps the coefficients in an extra list when
    # there is a single coefficient row -- confirm that this always validates
    # as a flat List[float].
    coeff: List[float]
    # Support vectors taken from the fitted model's ``support_vectors`` attribute.
    support_vectors: List[float]


class SVMAlgorithm(Algorithm, algname=ALGORITHM_NAME):
    """Algorithm entry point: validates the target, fits the SVM, wraps the result."""

    def run(self, data, metadata):
        """Fit the federated SVM on ``data`` and return an ``SVMResult``.

        Args:
            data: Pair ``(X, y)`` of feature and target tables.
            metadata: Mapping from variable name to its metadata; the
                ``"enumerations"`` entry of the target variable is read.

        Raises:
            BadUserInput: If the target variable has fewer than two
                enumerated levels.
        """
        X, y = data
        params = self.algorithm_parameters
        gamma, C = params["gamma"], params["C"]

        # Only the first target column is inspected for its levels.
        y_name = y.columns[0]
        n_levels = len(metadata[y_name]["enumerations"].keys())
        if n_levels < 2:
            raise BadUserInput(
                f"The variable {y_name} has less than 2 levels and SVM cannot be "
                "performed. Please choose another variable."
            )

        fitted = SVMFedAverage(self.engine)
        fitted.fit(X, y, gamma, C)

        return SVMResult(
            title="SVM Result",
            n_obs=fitted.nobs_train,
            coeff=fitted.coeff,
            support_vectors=fitted.support_vectors,
        )


# Type variable used as the ``schema`` argument of the ``relation(...)`` input
# declarations on the local-fit UDF.
S = TypeVar("S")


class SVMFedAverage:
    """Fit a linear SVM on every local node and combine the results globally.

    After ``fit`` has run, the instance exposes ``coeff``, ``support_vectors``
    and ``nobs_train``.
    """

    def __init__(self, engine):
        """Store the engine's local-node count and its local/global UDF runners."""
        self.num_local_nodes = engine.num_local_nodes
        self.local_run = engine.run_udf_on_local_nodes
        self.global_run = engine.run_udf_on_global_node

    def fit(self, x, y, gamma, C):
        """Run the local fits, combine them on the global node, store the results.

        Args:
            x: Feature table handed to the local UDF.
            y: Target table handed to the local UDF.
            gamma: Value forwarded to ``SVC(gamma=...)`` on each node.
            C: Value forwarded to ``SVC(C=...)`` on each node.

        Sets ``self.coeff``, ``self.support_vectors`` and ``self.nobs_train``.
        """
        # Each node returns two transfers: the parameters to be averaged and
        # the quantities that are only summed.
        params_to_average, other_params = self.local_run(
            func=self._fit_local,
            keyword_args={"x": x, "y": y, "gamma": gamma, "C": C},
            share_to_global=[True, True],
        )
        averaged_params_table = self.global_run(
            func=fed_average,
            keyword_args=dict(
                params=params_to_average, num_local_nodes=self.num_local_nodes
            ),
        )
        other_params_table = self.global_run(
            func=sum_secure_transfers,
            keyword_args=dict(loctransf=other_params),
        )

        averaged_params = get_transfer_data(averaged_params_table)
        other_params = get_transfer_data(other_params_table)

        self.coeff = averaged_params["coeff"]
        self.support_vectors = averaged_params["support_vectors"]
        self.nobs_train = other_params["nobs_train"]

    # Local step: fit ``SVC(kernel="linear")`` on this node's data and return
    # two secure-transfer payloads -- the parameters to be averaged
    # (coefficients and support vectors) and the number of training rows.
    # Raises ValueError when the local target holds fewer than two distinct
    # values.
    @staticmethod
    @udf(
        x=relation(schema=S),
        y=relation(schema=S),
        gamma=literal(),
        C=literal(),
        return_type=[secure_transfer(sum_op=True), secure_transfer(sum_op=True)],
    )
    def _fit_local(x, y, gamma, C):
        import numpy as np
        from sklearn.svm import SVC

        # scikit-learn expects 1-D labels; flattening the label column avoids
        # the DataConversionWarning it emits for a column-vector y.
        y = y.to_numpy().ravel()
        X = x.to_numpy()

        y_unq = np.unique(y)
        if len(y_unq) < 2:
            raise ValueError("Cannot perform SVM. Covariable has only one level.")

        # NOTE(review): scikit-learn documents gamma as the coefficient of the
        # rbf/poly/sigmoid kernels, so it presumably has no effect with
        # kernel="linear" -- confirm whether that is intended.
        model = SVC(kernel="linear", gamma=gamma, C=C)
        model.fit(X, y)

        coeff = model.coef_.squeeze().tolist()
        if len(model.coef_) < 2:
            # A single coefficient row is wrapped in an outer list.
            coeff = [coeff]

        params_to_average = {
            "coeff": {
                "data": coeff,
                "operation": "sum",
                "type": "float",
            },
            "support_vectors": {
                "data": model.support_vectors_.squeeze().tolist(),
                "operation": "sum",
                "type": "float",
            },
        }

        # Quantities that are summed across nodes but not averaged.
        other_params = {
            "nobs_train": {
                "data": len(y),
                "operation": "sum",
                "type": "int",
            }
        }

        return params_to_average, other_params
Loading

0 comments on commit 5cc7366

Please sign in to comment.