From 4b79978ec0c36f0f983d73ba88aa0357ad3ab84f Mon Sep 17 00:00:00 2001
From: Surya K
Date: Fri, 26 Feb 2021 12:26:11 +0530
Subject: [PATCH] add openml support & fix issues

- add cat indicator to train/test methods
- fix tests with tuple inputs
---
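Usage sketch for the new openml path (illustrative only: the method label,
estimator, metric choice, and dataset list are examples, not part of the
patch; the imports assume Analysis and SklearnMethod are importable from the
package root):

    from sklearn.linear_model import LinearRegression
    from mlgauge import Analysis, SklearnMethod

    # "linear" is an arbitrary label; any sklearn estimator works here
    linear = SklearnMethod(LinearRegression(), metrics=["r2", "neg_mean_squared_error"])
    an = Analysis(
        methods=[("linear", linear)],
        metric_names=["r2", "mse"],
        datasets=["cpu_small", 1030],  # names or integer IDs; IDs avoid version ambiguity
        data_source="openml",
        random_state=42,
    )
    an.run()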
 mlgauge/VERSION        |   2 +-
 mlgauge/analysis.py    | 124 ++++++++++++++++++++++++++++++++---------
 mlgauge/method.py      |  14 +++--
 requirements.txt       |   1 +
 tests/test_analysis.py |  48 +++++++++++-----
 5 files changed, 142 insertions(+), 47 deletions(-)

diff --git a/mlgauge/VERSION b/mlgauge/VERSION
index 3b04cfb..be58634 100644
--- a/mlgauge/VERSION
+++ b/mlgauge/VERSION
@@ -1 +1 @@
-0.2
+0.3
diff --git a/mlgauge/analysis.py b/mlgauge/analysis.py
index 8f24559..392ae3e 100644
--- a/mlgauge/analysis.py
+++ b/mlgauge/analysis.py
@@ -1,13 +1,14 @@
 import os
-import pmlb
+from copy import deepcopy
+from tqdm import tqdm

 import numpy as np
 import pandas as pd
 import xarray as xr
 import seaborn as sns
+import pmlb
+import openml

-from tqdm import tqdm
-from copy import deepcopy
 from sklearn.model_selection import train_test_split
 from sklearn.utils import check_random_state
@@ -34,16 +35,20 @@ class Analysis:
         result.loc['houses', 'linear', 'mse', 'test']

+    .. note::
+
+        When integer IDs are used for openml datasets, the dataset keys of the
+        ``results`` attribute are stored as strings.
+
     Refer to the documentation of `xarray <http://xarray.pydata.org/en/stable/>`_ for more detailed usage.
     """

-    # TODO allow metrics to be null
     def __init__(
         self,
         methods,
         metric_names=None,
         datasets="all",
         n_datasets=20,
+        data_source="pmlb",
         drop_na=False,
         use_test_set=True,
         test_size=0.25,
@@ -66,7 +71,8 @@ def __init__(
                 **"regression"**: randomly select `n_datasets` from all available regression datasets in pmlb.

-                **list of strings**: a list of valid pmlb dataset names.
+                **list of strings**: a list of valid pmlb/openml dataset names.
+                **list of ints**: a list of valid openml dataset IDs. IDs are recommended for openml, since names can be ambiguous across dataset versions.

                 **list of ('dataset_name', (X, y)) tuples**: Use this format to pass a custom dataset as X, y data.
@@ -75,7 +81,7 @@ def __init__(
                 Here, X, y could be numpy arrays or pandas DataFrames; using a DataFrame allows the input feature names to be passed to the methods.

             n_datasets (int): Number of datasets to randomly sample from the available pmlb datasets. Ignored if `datasets` is not a string.
-
+            data_source (str): Source to fetch the datasets from when names/IDs are passed. Either 'pmlb' or 'openml'.
             drop_na (bool): If True will drop all rows in the dataset with null values.
             random_state (None, int or RandomState instance): seed for the PRNG.
             use_test_set (bool): If the methods use a testing set.
@@ -91,6 +97,12 @@ def __init__(
         self.__methods = self._precheck_methods(methods)
         self.metric_names = metric_names

+        if data_source not in ("pmlb", "openml"):
+            raise ValueError("data_source must be 'pmlb' or 'openml'")
+        if data_source == "openml" and not isinstance(datasets, list):
+            raise TypeError("Provide a list of dataset IDs/names for openml")
+
+        self.data_source = data_source
         self.datasets = self._precheck_dataset(datasets)
         if isinstance(
             self.datasets, str
@@ -129,27 +141,35 @@ def run(self):
         with redirect_stdout() as stdout:
             # linespacing logic (18 additional chars for title etc.)
-            maxl = min(max([len(x) for x in self.datasets]) + 18, 80)
+            _datasets = map(
+                lambda x: x[0] if isinstance(x, tuple) else x, self.datasets
+            )  # get dataset names
+            maxl = min(max([len(str(x)) for x in _datasets]) + 18, 80)

             # iterate datasets
             datasets = tqdm(self.datasets, file=stdout, dynamic_ncols=True)
             for dataset in datasets:
+                _dataset_name = dataset[0] if isinstance(dataset, tuple) else dataset
                 datasets.set_description(
                     f"{{0: <{maxl}}}".format(
-                        f"{colors.GREEN}Datasets [{dataset}]{colors.ENDC}"
+                        f"{colors.GREEN}Datasets [{_dataset_name}]{colors.ENDC}"
                     )
                 )

                 if self.use_test_set:
                     (
                         dataset_name,
                         feature_names,
+                        category_indicator,
                         (X_train, y_train),
                         (X_test, y_test),
                     ) = self._get_dataset(dataset)
                 else:
-                    dataset_name, feature_names, (X_train, y_train) = self._get_dataset(
-                        dataset
-                    )
+                    (
+                        dataset_name,
+                        feature_names,
+                        category_indicator,
+                        (X_train, y_train),
+                    ) = self._get_dataset(dataset)

                 # iterate methods
                 methods = tqdm(
@@ -166,17 +186,25 @@ def run(self):
                     method.set_test_set(self.use_test_set)
                     # create output directory
                     output_dir = os.path.join(
-                        self.output_dir, dataset_name, method_name
+                        self.output_dir, str(dataset_name), method_name
                     )
                     os.makedirs(output_dir, exist_ok=True)
                     method.set_output_dir(output_dir)

                     # get training scores
-                    train_scores = method.train(X_train, y_train, feature_names)
+                    train_scores = method.train(
+                        X_train, y_train, feature_names, category_indicator
+                    )
+
+                    # integer openml IDs are stored as string keys in the results
+                    if isinstance(dataset_name, int):
+                        dataset_name = str(dataset_name)

                     # get optional testing scores
                     if self.use_test_set:
-                        test_scores = method.test(X_test, y_test, feature_names)
+                        test_scores = method.test(
+                            X_test, y_test, feature_names, category_indicator
+                        )
                         if self.metric_names:
                             self.results.loc[
                                 dataset_name, method_name, :, "train"
@@ -208,13 +236,24 @@ def _precheck_dataset(self, datasets):
             for d in datasets:
                 if isinstance(d, str):
                     # should be a valid pmlb dataset name
-                    if d not in pmlb.dataset_names:
-                        raise ValueError(f"Dataset {d} not in pmlb")
+                    if self.data_source == "pmlb":
+                        if d not in pmlb.dataset_names:
+                            raise ValueError(f"Dataset {d} not in pmlb")
+
+                elif isinstance(d, int):
+                    if self.data_source != "openml":
+                        raise ValueError("Integer dataset IDs are only valid for openml")
+
                 elif isinstance(d, tuple):
                     if not isinstance(d[0], str):
                         raise ValueError(
                             "First element of the tuple must be the name of the dataset"
                         )
+                    if len(d) not in [2, 3]:
+                        raise ValueError(
+                            "Custom dataset input should be a tuple of length 2 or 3"
+                        )
+
                 else:
                     raise TypeError(f"Invalid type {type(d)} for dataset.")
             else:
@@ -275,19 +314,27 @@ def _initialize_results(self):

     def _get_dataset_name(self, dataset):
         """Get the supplied name of the dataset"""
-        if isinstance(dataset, str):
+        if isinstance(dataset, (str, int)):
             return dataset
         elif isinstance(dataset, tuple):
             return dataset[0]

     def _get_dataset(self, dataset):
         """Load and return the dataset as X, y numpy arrays"""
-        if isinstance(dataset, str):  # Use pmlb
-            data = pmlb.fetch_data(dataset, local_cache_dir=self.local_cache_dir)
+        category_indicator = None  # boolean list marking categorical columns
+
+        if isinstance(dataset, str):  # Use pmlb or openml
+            if self.data_source == "pmlb":
+                data = pmlb.fetch_data(dataset, local_cache_dir=self.local_cache_dir)

-            # Get feature names and get X,y numpy arrays
-            X = data.drop("target", axis=1)
-            y = data["target"]
+                # Get feature names and get X, y arrays
+                X = data.drop("target", axis=1)
+                y = data["target"]
+            elif self.data_source == "openml":
+                X, y, category_indicator = self._fetch_openml_data(dataset)
+
+        elif isinstance(dataset, int):
+            X, y, category_indicator = self._fetch_openml_data(dataset)

         elif isinstance(dataset, tuple):
             if len(dataset) == 2:
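For reference, the new _fetch_openml_data helper below wraps the standard
openml-python fetch path; run standalone it looks like this (dataset ID 1030
is borrowed from the new test and is otherwise illustrative):

    import openml

    dataset = openml.datasets.get_dataset(1030)
    X, y, category_indicator, attribute_names = dataset.get_data(
        dataset_format="dataframe", target=dataset.default_target_attribute
    )
    # category_indicator holds one boolean per column of X,
    # True where that column is categorical
    print([name for name, is_cat in zip(attribute_names, category_indicator) if is_cat])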
"openml": + X, y, category_indicator = self._fetch_openml_data(dataset) + + elif isinstance(dataset, int): + X, y, category_indicator = self._fetch_openml_data(dataset) elif isinstance(dataset, tuple): if len(dataset) == 2: @@ -300,9 +347,20 @@ def _get_dataset(self, dataset): if self.use_test_set: X_test, y_test = self._format_na(X_test, y_test) - return dataset, feature_names, (X_train, y_train), (X_test, y_test) + return ( + dataset, + feature_names, + category_indicator, + (X_train, y_train), + (X_test, y_test), + ) else: - return dataset, feature_names, (X_train, y_train) + return ( + dataset, + feature_names, + category_indicator, + (X_train, y_train), + ) if self.use_test_set: # Perform train-test splits X_train, X_test, y_train, y_test = train_test_split( @@ -315,11 +373,25 @@ def _get_dataset(self, dataset): feature_names = self._get_feature_names(X_train) X_train, y_train = self._format_na(X_train, y_train) X_test, y_test = self._format_na(X_test, y_test) - return dataset, feature_names, (X_train, y_train), (X_test, y_test) + return ( + dataset, + feature_names, + category_indicator, + (X_train, y_train), + (X_test, y_test), + ) else: # Directly format and return train set feature_names = self._get_feature_names(X) X_train, y_train = self._format_na(X, y) - return dataset, feature_names, (X_train, y_train) + return dataset, feature_names, category_indicator, (X_train, y_train) + + def _fetch_openml_data(self, dataset_id): + """Get the openml dataset with the category indicator""" + data = openml.datasets.get_dataset(dataset_id) + X, y, category_indicator, attribute_names = data.get_data( + dataset_format="dataframe", target=data.default_target_attribute + ) + return X, y, category_indicator def _get_feature_names(self, X): """Get the list of feature names from input data""" diff --git a/mlgauge/method.py b/mlgauge/method.py index d98df5e..af97479 100644 --- a/mlgauge/method.py +++ b/mlgauge/method.py @@ -43,26 +43,28 @@ def set_test_set(self, use_test_set): """ self.use_test_set = use_test_set - def train(self, X_train, y_train, feature_names=None): + def train(self, X_train, y_train, feature_names=None, category_indicator=None): """Train the model and return the training score. Args: X_train (array): array of training vector. y_train (array): array of target vector. - feature_names (list): list of names of the features in X_train + feature_names (list): list of names of the features in X_train. + category_indicator (list): list of boolean indicating whether a feature is a categorical variable. Raises: NotImplementedError: raised when called from the base class. """ raise NotImplementedError - def test(self, X_test, y_test, feature_names=None): + def test(self, X_test, y_test, feature_names=None, category_indicator=None): """Evaluate the model and return the test score. Args: X_test (array): array of training vector. y_test (array): array of target vector. feature_names (list): list of names of the features in X_test + category_indicator (list): list of boolean indicating whether a feature is a categorical variable. Raises: NotImplementedError: raised when called from the base class. @@ -90,13 +92,14 @@ def __init__(self, estimator, metrics, export_model=False, cv=5): self.export_model = export_model self.cv = cv - def train(self, X_train, y_train, feature_names=None): + def train(self, X_train, y_train, feature_names=None, category_indicator=None): """Train the model and return the training score. Args: X_train (array): array of training vector. 
diff --git a/requirements.txt b/requirements.txt
index 3486629..83f43a5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
+openml>=0.11.0
 pmlb>=1.0
 pandas>=1.0.3
 scikit-learn>=0.22.2
diff --git a/tests/test_analysis.py b/tests/test_analysis.py
index df3e8c7..7fd5c01 100644
--- a/tests/test_analysis.py
+++ b/tests/test_analysis.py
@@ -60,7 +60,7 @@ def test_string(self, regressor, tmp_path):
         )
         assert len(an.datasets) == 5

-    def test_list(self, regressor, tmp_path):
+    def test_pmlb_list(self, regressor, tmp_path):
         # should work with a list of valid pmlb dataset names
         an = Analysis(
             methods=[("dummy", regressor)],
@@ -83,16 +83,34 @@ def test_list(self, regressor, tmp_path):
             local_cache_dir=PMLB_CACHE,
         )

+    def test_openml_list(self, regressor, tmp_path):
+        # should work with a list of valid openml dataset names/IDs
+        an = Analysis(
+            methods=[("dummy", regressor)],
+            metric_names=["r2", "max_error"],
+            datasets=[
+                "wind",
+                "cpu_small",
+                1030,
+            ],  # sometimes fails when the openml servers are busy
+            data_source="openml",
+            random_state=SEED,
+            output_dir=tmp_path,
+            local_cache_dir=PMLB_CACHE,
+        )
+        assert len(an.datasets) == 3
+        an.run()
+
     def test_tuple(self, regressor, tmp_path):
         # should work with a list of ('name', (X, y)) tuples
         datasets = [
             (
                 "data_1",
-                *make_regression(n_samples=200, n_features=5, random_state=SEED),
+                make_regression(n_samples=200, n_features=5, random_state=SEED),
             ),
             (
                 "data_2",
-                *make_regression(n_samples=1000, n_features=50, random_state=SEED),
+                make_regression(n_samples=1000, n_features=50, random_state=SEED),
             ),
         ]
         an = Analysis(
@@ -110,16 +128,16 @@ def test_tuple_train_test(self, regressor, tmp_path):
         datasets = [
             (
                 "data_1",
-                *make_regression(n_samples=200, n_features=5, random_state=SEED),
+                make_regression(n_samples=200, n_features=5, random_state=SEED),
             ),
             (
                 "data_2",
-                *make_regression(n_samples=1000, n_features=50, random_state=SEED),
+                make_regression(n_samples=1000, n_features=50, random_state=SEED),
             ),
         ]

         def test_split(data):
-            name, X, y = data
+            name, (X, y) = data
             X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=SEED)
             return name, (X_train, y_train), (X_test, y_test)
@@ -135,27 +153,25 @@ def test_split(data):
         )
         assert len(an.datasets) == 2

-    def test_mixed(self, regressor, tmp_path):
+    def test_pmlb_mixed(self, regressor, tmp_path):
        # should work with a mix of strings, tuples, and tuples of tuples
         datasets = [
             (
                 "data_1",
-                *make_regression(n_samples=200, n_features=5, random_state=SEED),
+                make_regression(n_samples=200, n_features=5, random_state=SEED),
             ),
             (
                 "data_2",
-                *make_regression(n_samples=1000, n_features=50, random_state=SEED),
+                make_regression(n_samples=1000, n_features=50, random_state=SEED),
             ),
         ]

         def test_split(data):
-            name, X, y = data
+            name, (X, y) = data
             X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=SEED)
             return name, (X_train, y_train), (X_test, y_test)

-        datasets = (
-            list(map(test_split, datasets)) + datasets + ["adult", "cars", "pima"]
-        )
+        datasets = list(map(test_split, datasets)) + datasets + ["adult"]

         an = Analysis(
             methods=[("dummy", regressor)],
@@ -164,8 +180,10 @@ def test_split(data):
             random_state=SEED,
             output_dir=tmp_path,
             local_cache_dir=PMLB_CACHE,
+            use_test_set=True,
         )
-        assert len(an.datasets) == 7
+        assert len(an.datasets) == 5
+        an.run()


 # Test dropna
@@ -176,7 +194,7 @@ def __init__(self, dropna):
             self.drop_na = dropna
             super().__init__()

-        def train(self, X, y, feature_names):
+        def train(self, X, y, feature_names, category_indicator=None):
             dat = np.hstack([X, y.reshape(-1, 1)])
             if self.drop_na:
                 assert not np.isnan(dat).any(), "data has missing values"
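A note on the string-key behaviour documented in the Analysis docstring:
after an.run(), a dataset passed as the integer 1030 is looked up with the
string "1030" (a sketch, reusing the names from the usage note at the top of
this patch):

    # integer openml IDs become string coordinates in the results xarray
    score = an.results.loc["1030", "linear", "r2", "test"]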