-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Adds test and expected for SVM * Adds SVM algorithm and specs * Changes name from mipengine to exareme2 in dependencies in svm algo * Adds svm tests to prod env
- Loading branch information
Showing
7 changed files
with
1,230 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
{ | ||
"name": "svm_scikit", | ||
"desc": "Classify data points by finding the maximum-margin hyperplane that separates the classes.", | ||
"label": "SVM", | ||
"enabled": true, | ||
"inputdata": { | ||
"y": { | ||
"label": "Classes", | ||
"desc": "Classes of x.", | ||
"types": [ | ||
"text", | ||
"int", | ||
"real" | ||
], | ||
"stattypes": [ | ||
"nominal" | ||
], | ||
"notblank": true, | ||
"multiple": true | ||
}, | ||
"x": { | ||
"label": "Data points", | ||
"desc": "Data points (feature vectors) to be divided into classes.", | ||
"types": [ | ||
"real", | ||
"int" | ||
], | ||
"stattypes": [ | ||
"numerical" | ||
], | ||
"notblank": true, | ||
"multiple": true | ||
} | ||
}, | ||
"parameters": { | ||
"gamma": { | ||
"label": "Gamma", | ||
"desc": "The gamma parameter of the RBF kernel controls how far the influence of a single training point reaches.", | ||
"types": [ | ||
"real" | ||
], | ||
"notblank": true, | ||
"multiple": false, | ||
"default": 0.1, | ||
"min": 0.0, | ||
"max": 1.0 | ||
}, | ||
"C": { | ||
"label": "C", | ||
"desc": "C regularization parameter used to set the tolerance of the model to allow the misclassification of data points in order to achieve lower generalization error. The C value controls the penalty of misclassification.", | ||
"types": [ | ||
"real" | ||
], | ||
"notblank": true, | ||
"multiple": false, | ||
"default": 1.0, | ||
"min": 0.0, | ||
"max": 1.0 | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,140 @@ | ||
from typing import List
from typing import TypeVar
from typing import Union

from pydantic import BaseModel

from exareme2.algorithms.algorithm import Algorithm
from exareme2.algorithms.algorithm import AlgorithmDataLoader
from exareme2.algorithms.fedaverage import fed_average
from exareme2.algorithms.helpers import get_transfer_data
from exareme2.algorithms.helpers import sum_secure_transfers
from exareme2.exceptions import BadUserInput
from exareme2.udfgen import literal
from exareme2.udfgen import relation
from exareme2.udfgen import secure_transfer
from exareme2.udfgen import udf
|
||
# Name shared by the data loader and the algorithm class below; it matches the
# "name" field of the algorithm's JSON specification.
ALGORITHM_NAME = "svm_scikit"


class SVMDataLoader(AlgorithmDataLoader, algname=ALGORITHM_NAME):
    """Data loader for the SVM algorithm, declared with ``algname=ALGORITHM_NAME``."""

    def get_variable_groups(self):
        """Return the variable groups to load: ``x`` first, then ``y``."""
        selected = self._variables
        return [selected.x, selected.y]
|
||
|
||
class SVMResult(BaseModel):
    """Result payload returned by the SVM algorithm.

    ``coeff`` and ``support_vectors`` accept either a flat list of floats or a
    list of rows. The local fitting step serialises
    ``model.coef_.squeeze().tolist()`` and
    ``model.support_vectors_.squeeze().tolist()``; ``squeeze`` only removes
    size-1 axes, so with more than one feature column those values are nested
    lists, which a plain ``List[float]`` annotation would reject.
    """

    # Title of the result.
    title: str
    # Number of training observations.
    n_obs: int
    # Fitted coefficients: flat for a single feature, one row per entry otherwise.
    coeff: List[Union[float, List[float]]]
    # Support vectors: flat for a single feature, one row per vector otherwise.
    support_vectors: List[Union[float, List[float]]]
|
||
|
||
class SVMAlgorithm(Algorithm, algname=ALGORITHM_NAME):
    """Algorithm entry point that fits an SVM through ``SVMFedAverage``."""

    def run(self, data, metadata):
        """Fit the SVM on ``data`` and return an ``SVMResult``.

        Parameters
        ----------
        data
            Pair ``(X, y)``, unpacked in that order.
        metadata
            Mapping indexed by variable name; the entry for the first ``y``
            column must provide an ``"enumerations"`` mapping.

        Raises
        ------
        BadUserInput
            If the first ``y`` column has fewer than two enumerations.
        """
        X, y = data
        gamma = self.algorithm_parameters["gamma"]
        C = self.algorithm_parameters["C"]

        # Only the first column of y is inspected.
        y_name = y.columns[0]
        enumerations = metadata[y_name]["enumerations"]
        too_few_levels = len(enumerations.keys()) < 2
        if too_few_levels:
            raise BadUserInput(
                f"The variable {y_name} has less than 2 levels and SVM cannot be "
                "performed. Please choose another variable."
            )

        fitted = SVMFedAverage(self.engine)
        fitted.fit(X, y, gamma, C)

        return SVMResult(
            title="SVM Result",
            n_obs=fitted.nobs_train,
            coeff=fitted.coeff,
            support_vectors=fitted.support_vectors,
        )
|
||
|
||
# Generic schema placeholder used by the relation arguments of the UDF below.
S = TypeVar("S")


class SVMFedAverage:
    """Fit an SVM on every local node and combine the results globally.

    After ``fit`` the instance exposes ``coeff``, ``support_vectors`` and
    ``nobs_train``.
    """

    def __init__(self, engine):
        """Keep the engine's node count and its local/global UDF runners."""
        self.num_local_nodes = engine.num_local_nodes
        self.local_run = engine.run_udf_on_local_nodes
        self.global_run = engine.run_udf_on_global_node

    def fit(self, x, y, gamma, C):
        """Run the local fit, then aggregate its two outputs on the global node.

        The first local output is handed to ``fed_average`` together with the
        number of local nodes; the second is handed to
        ``sum_secure_transfers``.
        """
        local_to_average, local_to_sum = self.local_run(
            func=self._fit_local,
            keyword_args=dict(x=x, y=y, gamma=gamma, C=C),
            share_to_global=[True, True],
        )

        # Both global steps are launched before either result is fetched.
        averaged_table = self.global_run(
            func=fed_average,
            keyword_args={
                "params": local_to_average,
                "num_local_nodes": self.num_local_nodes,
            },
        )
        summed_table = self.global_run(
            func=sum_secure_transfers,
            keyword_args={"loctransf": local_to_sum},
        )

        averaged = get_transfer_data(averaged_table)
        summed = get_transfer_data(summed_table)

        self.coeff = averaged["coeff"]
        self.support_vectors = averaged["support_vectors"]
        self.nobs_train = summed["nobs_train"]

    @staticmethod
    @udf(
        x=relation(schema=S),
        y=relation(schema=S),
        gamma=literal(),
        C=literal(),
        return_type=[secure_transfer(sum_op=True), secure_transfer(sum_op=True)],
    )
    def _fit_local(x, y, gamma, C):
        """Fit a linear-kernel ``SVC`` on one node's data.

        Returns two secure-transfer dictionaries: the parameters to be
        averaged (``coeff``, ``support_vectors``) and the quantities to be
        summed only (``nobs_train``).

        Raises ``ValueError`` when ``y`` holds fewer than two distinct values.
        """
        # Imports stay inside the function body, as in the original UDF.
        import numpy as np
        from sklearn.svm import SVC

        labels = y.to_numpy()
        features = x.to_numpy()

        if len(np.unique(labels)) < 2:
            raise ValueError("Cannot perform SVM. Covariable has only one level.")

        # NOTE(review): per the scikit-learn docs gamma is the coefficient for
        # the 'rbf', 'poly' and 'sigmoid' kernels, so with kernel="linear" it
        # appears to have no effect -- confirm the intended kernel.
        classifier = SVC(kernel="linear", gamma=gamma, C=C)
        classifier.fit(features, labels)

        # A coef_ with a single row is wrapped in a list after squeezing.
        coeff = classifier.coef_.squeeze().tolist()
        if len(classifier.coef_) < 2:
            coeff = [coeff]

        params_to_average = {
            "coeff": {
                "data": coeff,
                "operation": "sum",
                "type": "float",
            },
            "support_vectors": {
                "data": classifier.support_vectors_.squeeze().tolist(),
                "operation": "sum",
                "type": "float",
            },
        }

        # other quantities not meant to be averaged
        other_params = {
            "nobs_train": {
                "data": len(labels),
                "operation": "sum",
                "type": "int",
            },
        }

        return params_to_average, other_params
Oops, something went wrong.