Support predicting n number of tags
Update the serializer to recognize parameters given as a 'path.to.Model' string,
which will be loaded and passed back to the class specifying that path,
so that Model(estimator="sklearn.fancy.FancyModel") is loaded as
Model(estimator=FancyModel()).

This allows using models such as sklearn.multioutput.MultiOutputRegressor,
which require an 'estimator' parameter in the config file specification.
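
A minimal sketch of the behaviour described above, written against the same YAML shape the new test below exercises (the class paths are taken from that test; nothing here is new API):

import yaml
from gordo_components import serializer

# A parameter given as a single 'path.to.Model' string is located and instantiated,
# then passed to the wrapping class as a keyword argument.
config = """
sklearn.multioutput.MultiOutputRegressor:
    estimator: sklearn.ensemble.forest.RandomForestRegressor
"""
model = serializer.pipeline_from_definition(yaml.safe_load(config))
# model is a MultiOutputRegressor whose estimator is a RandomForestRegressor instance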
milesgranger committed Jun 26, 2019
1 parent 965b7c3 commit 0a8bf51
Showing 4 changed files with 129 additions and 1 deletion.
1 change: 0 additions & 1 deletion gordo_components/model/models.py
@@ -22,7 +22,6 @@
from sklearn.exceptions import NotFittedError
from gordo_components.model.base import GordoBase


# This is required to run `register_model_builder` against registered factories
from gordo_components.model.factories import * # pragma: no flakes

86 changes: 86 additions & 0 deletions gordo_components/serializer/pipeline_from_definition.py
@@ -115,6 +115,10 @@ def _build_step(
        import_str = list(step.keys())[0]
        params = step.get(import_str, dict())

        # Load any possible classes in the params if this is a dict of maybe kwargs
        if isinstance(params, dict):
            params = _load_param_classes(params)

        StepClass = pydoc.locate(
            import_str
        )  # type: Union[FeatureUnion, Pipeline, BaseEstimator]
@@ -169,3 +173,85 @@ def _build_step(
        raise ValueError(
            f"Expected step to be either a string or a dict, " f"found: {type(step)}"
        )


def _load_param_classes(params: dict):
    """
    Inspect the params' values and determine if any can be loaded as a class;
    if so, update that param's value to an instantiation of the class.

    Additionally, if a value is a dict with a single key, and that key can be
    loaded as a class, it is assumed to be a class path whose associated value
    is a dict of kwargs to pass to it.

    Parameters
    ----------
    params: dict
        Key value pairs of kwargs, which can have full class paths defined.

    Examples
    --------
    >>> params = {"key1": "value1"}
    >>> assert _load_param_classes(params) == params  # No modifications

    # Load an actual model, without any kwargs
    >>> from sklearn.ensemble import RandomForestRegressor
    >>> params = {"base_estimator": "sklearn.ensemble.forest.RandomForestRegressor"}
    >>> print(_load_param_classes(params))
    {'base_estimator': RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                       max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators='warn',
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)}

    # Load an actual model, with kwargs
    >>> params = {"base_estimator": {"sklearn.ensemble.forest.RandomForestRegressor": {"n_estimators": 20}}}
    >>> print(_load_param_classes(params))
    {'base_estimator': RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                       max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=20,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)}

    Returns
    -------
    dict
        Updated params, with any class paths loaded as instantiated objects.
    """
    params = copy.copy(params)
    for key, value in params.items():

        # If value is a simple string, try to load the model/class
        if isinstance(value, str):
            Model = pydoc.locate(value)
            if (
                Model is not None
                and isinstance(Model, type)
                and issubclass(Model, BaseEstimator)
            ):
                params[key] = Model()

        # For the next bit to work, the dict must have a single key (presumably the class path)
        # and its value must be a dict of kwargs for that class
        if (
            isinstance(value, dict)
            and len(value.keys()) == 1
            and isinstance(value[list(value.keys())[0]], dict)
        ):
            Model = pydoc.locate(list(value.keys())[0])
            if (
                Model is not None
                and isinstance(Model, type)
                and issubclass(Model, BaseEstimator)
            ):
                # Call this func again, in case these kwargs contain nested class paths themselves
                sub_params = value[list(value.keys())[0]]
                params[key] = Model(**_load_param_classes(sub_params))
    return params
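
A brief usage sketch of the recursive branch above (not part of the commit; the class paths mirror those in the docstring and tests):

# Class paths nested inside kwargs are resolved by the recursive call as well.
nested = {
    "model": {
        "sklearn.multioutput.MultiOutputRegressor": {
            "estimator": "sklearn.ensemble.forest.RandomForestRegressor"
        }
    }
}
loaded = _load_param_classes(nested)
# loaded["model"] is a MultiOutputRegressor wrapping a RandomForestRegressor instance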
11 changes: 11 additions & 0 deletions gordo_components/server/views/base.py
@@ -88,6 +88,15 @@ def frequency(self):
    def tags(self) -> typing.List[SensorTag]:
        return normalize_sensor_tags(current_app.metadata["dataset"]["tag_list"])

    @property
    def target_tags(self) -> typing.List[SensorTag]:
        if "target_tag_list" in current_app.metadata["dataset"]:
            return normalize_sensor_tags(
                current_app.metadata["dataset"]["target_tag_list"]
            )
        else:
            return []

    @staticmethod
    def _parse_iso_datetime(datetime_str: str) -> datetime:
        parsed_date = dateutil.parser.isoparse(datetime_str)  # type: ignore
@@ -283,6 +292,8 @@ def _process_request(
        self._data = data  # Assign the base response DF for any children to use

        context["tags"] = self.tags
        context["target-tags"] = self.target_tags

        if data is not None:
            context["data"] = self.multi_lvl_column_dataframe_to_dict(data)
        return make_response((jsonify(context), context.pop("status-code", 200)))
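
For context on the commit title: the number of predicted tags comes from the dataset metadata. When a 'target_tag_list' key is present it is normalized by the target_tags property above and echoed back to clients under 'target-tags'; otherwise an empty list is returned. A hedged sketch of such a metadata section follows (the tag names are hypothetical, and the assumption that target_tag_list names the predicted tags comes from the commit title rather than this diff):

# Hypothetical dataset metadata for a model reading three tags and predicting two.
# Only the keys used in the properties above are assumed; names are made up.
dataset_metadata = {
    "dataset": {
        "tag_list": ["TAG-1", "TAG-2", "TAG-3"],    # inputs, exposed as context["tags"]
        "target_tag_list": ["TAG-4", "TAG-5"],      # exposed as context["target-tags"]
    }
}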
@@ -6,17 +6,49 @@
import copy
import pydoc

import pytest
import numpy as np

from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer
from sklearn.multioutput import MultiOutputRegressor

from gordo_components import serializer
from gordo_components.serializer import pipeline_from_definition
import gordo_components.model.transformer_funcs.general
from gordo_components.model.register import register_model_builder

logger = logging.getLogger(__name__)


@pytest.mark.parametrize(
    "definition",
    [
        """
        sklearn.multioutput.MultiOutputRegressor:
            estimator: sklearn.ensemble.forest.RandomForestRegressor
        """,
        """
        sklearn.multioutput.MultiOutputRegressor:
            estimator:
                sklearn.ensemble.forest.RandomForestRegressor:
                    n_estimators: 20
        """,
    ],
)
def test_load_from_definition(definition):
    """
    Ensure serializer can load models which take other models as parameters.
    """
    X, y = np.random.random((10, 10)), np.random.random((10, 2))
    definition = yaml.load(definition, Loader=yaml.SafeLoader)
    model = serializer.pipeline_from_definition(definition)
    assert isinstance(model, MultiOutputRegressor)
    model.fit(X, y)
    model.predict(X)


class ConfigToScikitLearnPipeTestCase(unittest.TestCase):
    def setup_gen(self):
        self.factories = register_model_builder.factories
