Support predicting n number of tags
Update the serializer to recognize parameters given as a 'path.to.Model' string,
which will be loaded and passed back to the class specifying that path,
so that Model(estimator="sklearn.fancy.FancyModel") is loaded as
Model(estimator=FancyModel()).

This allows using models such as sklearn.multioutput.MultiOutputRegressor,
which require an 'estimator' parameter in the config file specification.
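
A minimal sketch of the behaviour described above, written against the same YAML shape the new test below exercises (the class paths are taken from that test; nothing here is new API):

import yaml
from gordo_components import serializer

# A parameter given as a single 'path.to.Model' string is located and instantiated,
# then passed to the wrapping class as a keyword argument.
config = """
sklearn.multioutput.MultiOutputRegressor:
    estimator: sklearn.ensemble.forest.RandomForestRegressor
"""
model = serializer.pipeline_from_definition(yaml.safe_load(config))
# model is a MultiOutputRegressor whose estimator is a RandomForestRegressor instance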
milesgranger committed Jun 26, 2019
1 parent 965b7c3 commit 0a8bf51
Showing 4 changed files with 129 additions and 1 deletion.
1 change: 0 additions & 1 deletion gordo_components/model/models.py
@@ -22,7 +22,6 @@
from sklearn.exceptions import NotFittedError
from gordo_components.model.base import GordoBase


# This is required to run `register_model_builder` against registered factories
from gordo_components.model.factories import * # pragma: no flakes

86 changes: 86 additions & 0 deletions gordo_components/serializer/pipeline_from_definition.py
@@ -115,6 +115,10 @@ def _build_step(
        import_str = list(step.keys())[0]
        params = step.get(import_str, dict())

        # Load any possible classes in the params if this is a dict of maybe kwargs
        if isinstance(params, dict):
            params = _load_param_classes(params)

        StepClass = pydoc.locate(
            import_str
        )  # type: Union[FeatureUnion, Pipeline, BaseEstimator]
@@ -169,3 +173,85 @@ def _build_step(
        raise ValueError(
            f"Expected step to be either a string or a dict, " f"found: {type(step)}"
        )


def _load_param_classes(params: dict):
    """
    Inspect the params' values and determine if any can be loaded as a class;
    if so, update that param's value to an instantiation of the class.

    Additionally, if a value is a dict with a single key, and that key can be
    loaded as a class, it is assumed to be a class path whose associated value
    is a dict of kwargs to pass to it.

    Parameters
    ----------
    params: dict
        Key value pairs of kwargs, which can have full class paths defined.

    Examples
    --------
    >>> params = {"key1": "value1"}
    >>> assert _load_param_classes(params) == params  # No modifications

    # Load an actual model, without any kwargs
    >>> from sklearn.ensemble import RandomForestRegressor
    >>> params = {"base_estimator": "sklearn.ensemble.forest.RandomForestRegressor"}
    >>> print(_load_param_classes(params))
    {'base_estimator': RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                       max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators='warn',
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)}

    # Load an actual model, with kwargs
    >>> params = {"base_estimator": {"sklearn.ensemble.forest.RandomForestRegressor": {"n_estimators": 20}}}
    >>> print(_load_param_classes(params))
    {'base_estimator': RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                       max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=20,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)}

    Returns
    -------
    dict
        Updated params, with any class paths loaded as instantiated objects.
    """
    params = copy.copy(params)
    for key, value in params.items():

        # If value is a simple string, try to load the model/class
        if isinstance(value, str):
            Model = pydoc.locate(value)
            if (
                Model is not None
                and isinstance(Model, type)
                and issubclass(Model, BaseEstimator)
            ):
                params[key] = Model()

        # For the next bit to work, the dict must have a single key (presumably the class path)
        # and its value must be a dict of kwargs for that class
        if (
            isinstance(value, dict)
            and len(value.keys()) == 1
            and isinstance(value[list(value.keys())[0]], dict)
        ):
            Model = pydoc.locate(list(value.keys())[0])
            if (
                Model is not None
                and isinstance(Model, type)
                and issubclass(Model, BaseEstimator)
            ):
                # Call this func again, in case these kwargs contain nested class paths themselves
                sub_params = value[list(value.keys())[0]]
                params[key] = Model(**_load_param_classes(sub_params))
    return params
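
A brief usage sketch of the recursive branch above (not part of the commit; the class paths mirror those in the docstring and tests):

# Class paths nested inside kwargs are resolved by the recursive call as well.
nested = {
    "model": {
        "sklearn.multioutput.MultiOutputRegressor": {
            "estimator": "sklearn.ensemble.forest.RandomForestRegressor"
        }
    }
}
loaded = _load_param_classes(nested)
# loaded["model"] is a MultiOutputRegressor wrapping a RandomForestRegressor instance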
11 changes: 11 additions & 0 deletions gordo_components/server/views/base.py
@@ -88,6 +88,15 @@ def frequency(self):
    def tags(self) -> typing.List[SensorTag]:
        return normalize_sensor_tags(current_app.metadata["dataset"]["tag_list"])

    @property
    def target_tags(self) -> typing.List[SensorTag]:
        if "target_tag_list" in current_app.metadata["dataset"]:
            return normalize_sensor_tags(
                current_app.metadata["dataset"]["target_tag_list"]
            )
        else:
            return []

    @staticmethod
    def _parse_iso_datetime(datetime_str: str) -> datetime:
        parsed_date = dateutil.parser.isoparse(datetime_str)  # type: ignore
@@ -283,6 +292,8 @@ def _process_request(
        self._data = data  # Assign the base response DF for any children to use

        context["tags"] = self.tags
        context["target-tags"] = self.target_tags

        if data is not None:
            context["data"] = self.multi_lvl_column_dataframe_to_dict(data)
        return make_response((jsonify(context), context.pop("status-code", 200)))
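
For context on the commit title: the number of predicted tags comes from the dataset metadata. When a 'target_tag_list' key is present it is normalized by the target_tags property above and echoed back to clients under 'target-tags'; otherwise an empty list is returned. A hedged sketch of such a metadata section follows (the tag names are hypothetical, and the assumption that target_tag_list names the predicted tags comes from the commit title rather than this diff):

# Hypothetical dataset metadata for a model reading three tags and predicting two.
# Only the keys used in the properties above are assumed; names are made up.
dataset_metadata = {
    "dataset": {
        "tag_list": ["TAG-1", "TAG-2", "TAG-3"],    # inputs, exposed as context["tags"]
        "target_tag_list": ["TAG-4", "TAG-5"],      # exposed as context["target-tags"]
    }
}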
@@ -6,17 +6,49 @@
import copy
import pydoc

import pytest
import numpy as np

from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer
from sklearn.multioutput import MultiOutputRegressor

from gordo_components import serializer
from gordo_components.serializer import pipeline_from_definition
import gordo_components.model.transformer_funcs.general
from gordo_components.model.register import register_model_builder

logger = logging.getLogger(__name__)


@pytest.mark.parametrize(
    "definition",
    [
        """
        sklearn.multioutput.MultiOutputRegressor:
            estimator: sklearn.ensemble.forest.RandomForestRegressor
        """,
        """
        sklearn.multioutput.MultiOutputRegressor:
            estimator:
                sklearn.ensemble.forest.RandomForestRegressor:
                    n_estimators: 20
        """,
    ],
)
def test_load_from_definition(definition):
    """
    Ensure serializer can load models which take other models as parameters.
    """
    X, y = np.random.random((10, 10)), np.random.random((10, 2))
    definition = yaml.load(definition, Loader=yaml.SafeLoader)
    model = serializer.pipeline_from_definition(definition)
    assert isinstance(model, MultiOutputRegressor)
    model.fit(X, y)
    model.predict(X)


class ConfigToScikitLearnPipeTestCase(unittest.TestCase):
    def setup_gen(self):
        self.factories = register_model_builder.factories
