From 0a8bf513ee95dfa712207055557ce3c96f9c182a Mon Sep 17 00:00:00 2001
From: Miles Granger
Date: Tue, 4 Jun 2019 17:27:53 +0200
Subject: [PATCH] Support predicting an arbitrary number of tags

Update the serializer to recognize parameters given as a possible
path.to.Model, which will be loaded and passed back to the class
specifying that path. For example, Model(estimator="sklearn.fancy.FancyModel")
will be loaded as Model(estimator=FancyModel()). This allows using
estimators such as sklearn.multioutput.MultiOutputRegressor, which
needs an 'estimator' as a parameter, directly in the config file
specification.
---
 gordo_components/model/models.py              |  1 -
 .../serializer/pipeline_from_definition.py    | 86 +++++++++++++++++++
 gordo_components/server/views/base.py         | 11 +++
 .../test_serializer_from_definition.py        | 32 +++++++
 4 files changed, 129 insertions(+), 1 deletion(-)

diff --git a/gordo_components/model/models.py b/gordo_components/model/models.py
index 8dd2f4ae3..a08fa1d1f 100644
--- a/gordo_components/model/models.py
+++ b/gordo_components/model/models.py
@@ -22,7 +22,6 @@
 from sklearn.exceptions import NotFittedError
 
 from gordo_components.model.base import GordoBase
-
 # This is required to run `register_model_builder` against registered factories
 from gordo_components.model.factories import *  # pragma: no flakes
 
diff --git a/gordo_components/serializer/pipeline_from_definition.py b/gordo_components/serializer/pipeline_from_definition.py
index 9d64d50e4..fa35d5fd7 100644
--- a/gordo_components/serializer/pipeline_from_definition.py
+++ b/gordo_components/serializer/pipeline_from_definition.py
@@ -115,6 +115,10 @@ def _build_step(
     import_str = list(step.keys())[0]
     params = step.get(import_str, dict())
 
+    # Load any possible classes in the params if this is a dict of (possible) kwargs
+    if isinstance(params, dict):
+        params = _load_param_classes(params)
+
     StepClass = pydoc.locate(
         import_str
     )  # type: Union[FeatureUnion, Pipeline, BaseEstimator]
@@ -169,3 +173,85 @@ def _build_step(
     raise ValueError(
         f"Expected step to be either a string or a dict," f"found: {type(step)}"
     )
+
+
+def _load_param_classes(params: dict):
+    """
+    Inspect the params' values and determine if any can be loaded as a class.
+    If so, update that param's value to be an instance of the class.
+
+    Additionally, if a value is itself a dict with a single key, and that key
+    can be loaded as a class, it is assumed to be a class path whose associated
+    values should be passed in as kwargs.
+
+    Parameters
+    ----------
+    params: dict
+        Key/value pairs of kwargs; values may be given as full class paths.
+
+    Examples
+    --------
+    >>> params = {"key1": "value1"}
+    >>> assert _load_param_classes(params) == params  # No modifications
+
+    # Load an actual model, without any kwargs
+    >>> from sklearn.ensemble import RandomForestRegressor
+    >>> params = {"base_estimator": "sklearn.ensemble.forest.RandomForestRegressor"}
+    >>> print(_load_param_classes(params))
+    {'base_estimator': RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
+               max_features='auto', max_leaf_nodes=None,
+               min_impurity_decrease=0.0, min_impurity_split=None,
+               min_samples_leaf=1, min_samples_split=2,
+               min_weight_fraction_leaf=0.0, n_estimators='warn',
+               n_jobs=None, oob_score=False, random_state=None,
+               verbose=0, warm_start=False)}
+
+    # Load an actual model, with kwargs
+    >>> params = {"base_estimator": {"sklearn.ensemble.forest.RandomForestRegressor": {"n_estimators": 20}}}
+    >>> print(_load_param_classes(params))
+    {'base_estimator': RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
+               max_features='auto', max_leaf_nodes=None,
+               min_impurity_decrease=0.0, min_impurity_split=None,
+               min_samples_leaf=1, min_samples_split=2,
+               min_weight_fraction_leaf=0.0, n_estimators=20,
+               n_jobs=None, oob_score=False, random_state=None,
+               verbose=0, warm_start=False)}
+
+
+    Returns
+    -------
+    dict
+        Updated params in which any class paths have been replaced by
+        instantiated objects
+    """
+    params = copy.copy(params)
+    for key, value in params.items():
+
+        # If value is a simple string, try to load the model/class
+        if isinstance(value, str):
+            Model = pydoc.locate(value)
+            if (
+                Model is not None
+                and isinstance(Model, type)
+                and issubclass(Model, BaseEstimator)
+            ):
+
+                params[key] = Model()
+
+        # For the next bit to work, the dict must have a single key (maybe the class path)
+        # and its value must be a dict of kwargs
+        if (
+            isinstance(value, dict)
+            and len(value.keys()) == 1
+            and isinstance(value[list(value.keys())[0]], dict)
+        ):
+            Model = pydoc.locate(list(value.keys())[0])
+            if (
+                Model is not None
+                and isinstance(Model, type)
+                and issubclass(Model, BaseEstimator)
+            ):
+                # Call this func again, in case there are nested occurrences of this problem in these kwargs
+                sub_params = value[list(value.keys())[0]]
+                params[key] = Model(**_load_param_classes(sub_params))
+    return params
diff --git a/gordo_components/server/views/base.py b/gordo_components/server/views/base.py
index 03fb1a59e..43676b00d 100644
--- a/gordo_components/server/views/base.py
+++ b/gordo_components/server/views/base.py
@@ -88,6 +88,15 @@ def frequency(self):
     def tags(self) -> typing.List[SensorTag]:
         return normalize_sensor_tags(current_app.metadata["dataset"]["tag_list"])
 
+    @property
+    def target_tags(self) -> typing.List[SensorTag]:
+        if "target_tag_list" in current_app.metadata["dataset"]:
+            return normalize_sensor_tags(
+                current_app.metadata["dataset"]["target_tag_list"]
+            )
+        else:
+            return []
+
     @staticmethod
     def _parse_iso_datetime(datetime_str: str) -> datetime:
         parsed_date = dateutil.parser.isoparse(datetime_str)  # type: ignore
@@ -283,6 +292,8 @@ def _process_request(
         self._data = data  # Assign the base response DF for any children to use
 
         context["tags"] = self.tags
+        context["target-tags"] = self.target_tags
+
         if data is not None:
             context["data"] = self.multi_lvl_column_dataframe_to_dict(data)
         return make_response((jsonify(context), context.pop("status-code", 200)))
diff --git a/tests/gordo_components/serializer/test_serializer_from_definition.py b/tests/gordo_components/serializer/test_serializer_from_definition.py
index 6ca99eb1a..0aadba707 100644
--- a/tests/gordo_components/serializer/test_serializer_from_definition.py
+++ b/tests/gordo_components/serializer/test_serializer_from_definition.py
@@ -6,10 +6,15 @@
 import copy
 import pydoc
 
+import pytest
+import numpy as np
+
 from sklearn.decomposition import PCA, TruncatedSVD
 from sklearn.pipeline import FeatureUnion, Pipeline
 from sklearn.preprocessing import MinMaxScaler, FunctionTransformer
+from sklearn.multioutput import MultiOutputRegressor
 
+from gordo_components import serializer
 from gordo_components.serializer import pipeline_from_definition
 import gordo_components.model.transformer_funcs.general
 from gordo_components.model.register import register_model_builder
@@ -17,6 +22,33 @@
 logger = logging.getLogger(__name__)
 
 
+@pytest.mark.parametrize(
+    "definition",
+    [
+        """
+    sklearn.multioutput.MultiOutputRegressor:
+        estimator: sklearn.ensemble.forest.RandomForestRegressor
+    """,
+        """
+    sklearn.multioutput.MultiOutputRegressor:
+        estimator:
+            sklearn.ensemble.forest.RandomForestRegressor:
+                n_estimators: 20
+    """,
+    ],
+)
+def test_load_from_definition(definition):
+    """
+    Ensure the serializer can load models which take other models as parameters.
+    """
+    X, y = np.random.random((10, 10)), np.random.random((10, 2))
+    definition = yaml.load(definition, Loader=yaml.SafeLoader)
+    model = serializer.pipeline_from_definition(definition)
+    assert isinstance(model, MultiOutputRegressor)
+    model.fit(X, y)
+    model.predict(X)
+
+
 class ConfigToScikitLearnPipeTestCase(unittest.TestCase):
     def setup_gen(self):
         self.factories = register_model_builder.factories
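
For reference, below is a minimal usage sketch (not part of the diff) of what this change enables, based on the new parametrized test above: a definition whose estimator parameter is itself a class path with kwargs is resolved by the serializer into a nested estimator instance. The number of output columns then corresponds to the dataset's target_tag_list, which the server now exposes as "target-tags" in responses. Note the "sklearn.ensemble.forest" path assumes the scikit-learn version in use at the time of this patch; newer versions expose it as "sklearn.ensemble.RandomForestRegressor".

import numpy as np
import yaml

from gordo_components import serializer

# Config-style definition: MultiOutputRegressor wrapping a RandomForestRegressor
# given as a class path with kwargs (mirrors the parametrized test above).
definition = yaml.safe_load(
    """
    sklearn.multioutput.MultiOutputRegressor:
        estimator:
            sklearn.ensemble.forest.RandomForestRegressor:
                n_estimators: 20
    """
)

model = serializer.pipeline_from_definition(definition)

# Two output columns -> e.g. two target tags; each column gets its own regressor.
X, y = np.random.random((10, 10)), np.random.random((10, 2))
model.fit(X, y)
print(model.predict(X).shape)  # (10, 2)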