Skip to content

Commit

Permalink
add openml support & fix issues
Browse files Browse the repository at this point in the history
- add cat indicator to train/test methods
- fix tests with tuple inputs
  • Loading branch information
SuryaThiru committed Feb 26, 2021
1 parent 42a0649 commit 4b79978
Show file tree
Hide file tree
Showing 5 changed files with 142 additions and 47 deletions.
2 changes: 1 addition & 1 deletion mlgauge/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.2
0.3
124 changes: 98 additions & 26 deletions mlgauge/analysis.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
import os
import pmlb
from copy import deepcopy

from tqdm import tqdm
import numpy as np
import pandas as pd
import xarray as xr
import seaborn as sns
import pmlb
import openml

from tqdm import tqdm
from copy import deepcopy
from sklearn.model_selection import train_test_split
from sklearn.utils import check_random_state

Expand All @@ -34,16 +35,20 @@ class Analysis:
result.loc['houses', 'linear', 'mse', 'test']
.. note::
When integer IDs are specified for openml datasets, the dataset key in the ``results`` attribute is stored as a string.
Refer to the documentation of `xarray <https://xarray.pydata.org/en/stable/quick-overview.html>`_ for more detailed usage.
"""

# TODO allow metrics to be null
def __init__(
self,
methods,
metric_names=None,
datasets="all",
n_datasets=20,
data_source="pmlb",
drop_na=False,
use_test_set=True,
test_size=0.25,
Expand All @@ -66,7 +71,8 @@ def __init__(
**"regression"**: randomly select `n_datasets` from all available regression datasets in pmlb.
**list of strings**: a list of valid pmlb dataset names.
**list of strings**: a list of valid pmlb/openml dataset names.
**list of ints**: a list of valid openml dataset IDs. This is recommended for openml to avoid issues with versions.
**list of ('dataset_name', (X, y)) tuples**: Use the method to pass a custom dataset in the X y format.
Expand All @@ -75,7 +81,7 @@ def __init__(
Here, X y could be a numpy array or a pandas DataFrame, using a DataFrame will allow the input feature names to be passed to the methods.
n_datasets (int): Number of datasets to randomly sample from the available pmlb datasets. Ignored if `datasets` is not a string.
data_source (str): Source to fetch from when dataset names/IDs are passed. 'pmlb' or 'openml'
drop_na (bool): If True will drop all rows in the dataset with null values.
random_state (None, int or RandomState instance): seed for the PRNG.
use_test_set (bool): If the methods use a testing set.
Expand All @@ -91,6 +97,12 @@ def __init__(
self.__methods = self._precheck_methods(methods)
self.metric_names = metric_names

if data_source != "openml" and data_source != "pmlb":
raise TypeError("Data source must be 'openml' or 'pmlb'")
if data_source == "openml" and not isinstance(datasets, list):
raise TypeError("Provide list of dataset IDs/names for openml")

self.data_source = data_source
self.datasets = self._precheck_dataset(datasets)
if isinstance(
self.datasets, str
Expand Down Expand Up @@ -129,27 +141,35 @@ def run(self):
with redirect_stdout() as stdout:

# linespacing logic (18 additional chars for title etc.)
maxl = min(max([len(x) for x in self.datasets]) + 18, 80)
_datasets = map(
lambda x: x[0] if isinstance(x, tuple) else x, self.datasets
) # get dataset names
maxl = min(max([len(str(x)) for x in _datasets]) + 18, 80)

# iterate datasets
datasets = tqdm(self.datasets, file=stdout, dynamic_ncols=True)
for dataset in datasets:
_dataset_name = dataset[0] if isinstance(dataset, tuple) else dataset
datasets.set_description(
f"{{0: <{maxl}}}".format(
f"{colors.GREEN}Datasets [{dataset}]{colors.ENDC}"
f"{colors.GREEN}Datasets [{_dataset_name}]{colors.ENDC}"
)
)
if self.use_test_set:
(
dataset_name,
feature_names,
category_indicator,
(X_train, y_train),
(X_test, y_test),
) = self._get_dataset(dataset)
else:
dataset_name, feature_names, (X_train, y_train) = self._get_dataset(
dataset
)
(
dataset_name,
feature_names,
category_indicator,
(X_train, y_train),
) = self._get_dataset(dataset)

# iterate methods
methods = tqdm(
Expand All @@ -166,17 +186,25 @@ def run(self):
method.set_test_set(self.use_test_set)
# create output directory
output_dir = os.path.join(
self.output_dir, dataset_name, method_name
self.output_dir, str(dataset_name), method_name
)
os.makedirs(output_dir, exist_ok=True)
method.set_output_dir(output_dir)

# get training scores
train_scores = method.train(X_train, y_train, feature_names)
train_scores = method.train(
X_train, y_train, feature_names, category_indicator
)

# change keys for result to string
if isinstance(dataset_name, int):
dataset_name = str(dataset_name)

# get optional testing scores
if self.use_test_set:
test_scores = method.test(X_test, y_test, feature_names)
test_scores = method.test(
X_test, y_test, feature_names, category_indicator
)
if self.metric_names:
self.results.loc[
dataset_name, method_name, :, "train"
Expand Down Expand Up @@ -208,13 +236,24 @@ def _precheck_dataset(self, datasets):
for d in datasets:
if isinstance(d, str):
# pmlb names must be known pmlb datasets; openml names are not validated here
if d not in pmlb.dataset_names:
raise ValueError(f"Dataset {d} not in pmlb")
if self.data_source == "pmlb":
if d not in pmlb.dataset_names:
raise ValueError(f"Dataset {d} not in pmlb")

elif isinstance(d, int):
if self.data_source != "openml":
raise ValueError("Integer data IDs are only valid for OpenML")

elif isinstance(d, tuple):
if not isinstance(d[0], str):
raise ValueError(
"First element of the tuple must be the name of the dataset"
)
if len(d) not in [2, 3]:
raise ValueError(
"Custom dataset input should be a tuple of length 2 or 3"
)

else:
raise TypeError(f"Invalid type {type(d)} for dataset.")
else:
Expand Down Expand Up @@ -275,19 +314,27 @@ def _initialize_results(self):

def _get_dataset_name(self, dataset):
"""Get the supplied name of the dataset"""
if isinstance(dataset, str):
if isinstance(dataset, str) or isinstance(dataset, int):
return dataset
elif isinstance(dataset, tuple):
return dataset[0]

def _get_dataset(self, dataset):
"""Load and return the dataset as X, y numpy arrays"""
if isinstance(dataset, str): # Use pmlb
data = pmlb.fetch_data(dataset, local_cache_dir=self.local_cache_dir)
category_indicator = None # list indicating categorical columns

if isinstance(dataset, str): # Use pmlb or openml
if self.data_source == "pmlb":
data = pmlb.fetch_data(dataset, local_cache_dir=self.local_cache_dir)

# Get feature names and get X,y numpy arrays
X = data.drop("target", axis=1)
y = data["target"]
# Get feature names and get X,y numpy arrays
X = data.drop("target", axis=1)
y = data["target"]
elif self.data_source == "openml":
X, y, category_indicator = self._fetch_openml_data(dataset)

elif isinstance(dataset, int):
X, y, category_indicator = self._fetch_openml_data(dataset)

elif isinstance(dataset, tuple):
if len(dataset) == 2:
Expand All @@ -300,9 +347,20 @@ def _get_dataset(self, dataset):

if self.use_test_set:
X_test, y_test = self._format_na(X_test, y_test)
return dataset, feature_names, (X_train, y_train), (X_test, y_test)
return (
dataset,
feature_names,
category_indicator,
(X_train, y_train),
(X_test, y_test),
)
else:
return dataset, feature_names, (X_train, y_train)
return (
dataset,
feature_names,
category_indicator,
(X_train, y_train),
)

if self.use_test_set: # Perform train-test splits
X_train, X_test, y_train, y_test = train_test_split(
Expand All @@ -315,11 +373,25 @@ def _get_dataset(self, dataset):
feature_names = self._get_feature_names(X_train)
X_train, y_train = self._format_na(X_train, y_train)
X_test, y_test = self._format_na(X_test, y_test)
return dataset, feature_names, (X_train, y_train), (X_test, y_test)
return (
dataset,
feature_names,
category_indicator,
(X_train, y_train),
(X_test, y_test),
)
else: # Directly format and return train set
feature_names = self._get_feature_names(X)
X_train, y_train = self._format_na(X, y)
return dataset, feature_names, (X_train, y_train)
return dataset, feature_names, category_indicator, (X_train, y_train)

def _fetch_openml_data(self, dataset_id):
    """Fetch an openml dataset together with its category indicator.

    Args:
        dataset_id (int or str): dataset ID or name, passed unchanged to
            ``openml.datasets.get_dataset``.

    Returns:
        tuple: ``(X, y, category_indicator)`` as produced by the dataset's
        ``get_data`` call with ``dataset_format="dataframe"`` and the
        dataset's default target attribute.

    Raises:
        ValueError: if the dataset declares no default target attribute, so
            no target column can be separated from the features.
    """
    data = openml.datasets.get_dataset(dataset_id)

    # Without a default target, get_data would be asked for ``target=None``;
    # fail here with a clear message instead of later in the pipeline.
    target = data.default_target_attribute
    if target is None:
        raise ValueError(
            f"OpenML dataset {dataset_id} has no default target attribute"
        )

    # The fourth return value (attribute names) is not needed by callers.
    X, y, category_indicator, _ = data.get_data(
        dataset_format="dataframe", target=target
    )
    return X, y, category_indicator

def _get_feature_names(self, X):
"""Get the list of feature names from input data"""
Expand Down
14 changes: 9 additions & 5 deletions mlgauge/method.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,26 +43,28 @@ def set_test_set(self, use_test_set):
"""
self.use_test_set = use_test_set

def train(self, X_train, y_train, feature_names=None, category_indicator=None):
    """Train the model and return the training score.

    Args:
        X_train (array): array of training vector.
        y_train (array): array of target vector.
        feature_names (list): list of names of the features in X_train.
        category_indicator (list): list of boolean indicating whether a feature is a categorical variable.

    Raises:
        NotImplementedError: raised when called from the base class.
    """
    # Base-class placeholder: concrete methods must override this.
    raise NotImplementedError

def test(self, X_test, y_test, feature_names=None):
def test(self, X_test, y_test, feature_names=None, category_indicator=None):
"""Evaluate the model and return the test score.
Args:
X_test (array): array of testing vector.
y_test (array): array of target vector.
feature_names (list): list of names of the features in X_test
category_indicator (list): list of boolean indicating whether a feature is a categorical variable.
Raises:
NotImplementedError: raised when called from the base class.
Expand Down Expand Up @@ -90,13 +92,14 @@ def __init__(self, estimator, metrics, export_model=False, cv=5):
self.export_model = export_model
self.cv = cv

def train(self, X_train, y_train, feature_names=None):
def train(self, X_train, y_train, feature_names=None, category_indicator=None):
"""Train the model and return the training score.
Args:
X_train (array): array of training vector.
y_train (array): array of target vector.
feature_names (list): list of names of the features in X_train
category_indicator (list): list of boolean indicating whether a feature is a categorical variable.
Returns:
list: list of metric scores evaluated on the training data.
Expand Down Expand Up @@ -135,13 +138,14 @@ def train(self, X_train, y_train, feature_names=None):

return [scores_dict["test_" + key] for key in self.metrics]

def test(self, X_test, y_test, feature_names=None):
def test(self, X_test, y_test, feature_names=None, category_indicator=None):
"""Evaluate the model and return the test score.
Args:
X_test (array): array of testing vector.
y_test (array): array of target vector.
feature_names (list): list of names of the features in X_test
category_indicator (list): list of boolean indicating whether a feature is a categorical variable.
Returns:
list: list of metric scores evaluated on the testing data.
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
openml>=0.11.0
pmlb>=1.0
pandas>=1.0.3
scikit-learn>=0.22.2
Expand Down
Loading

0 comments on commit 4b79978

Please sign in to comment.