From 4b79978ec0c36f0f983d73ba88aa0357ad3ab84f Mon Sep 17 00:00:00 2001
From: Surya K
Date: Fri, 26 Feb 2021 12:26:11 +0530
Subject: [PATCH] add openml support & fix issues

- add cat indicator to train/test methods
- fix tests with tuple inputs
---
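Usage sketch for the new openml path (illustrative only: the method label,
estimator, metric choice, and dataset list are examples, not part of the
patch; the imports assume Analysis and SklearnMethod are importable from the
package root):

    from sklearn.linear_model import LinearRegression
    from mlgauge import Analysis, SklearnMethod

    # "linear" is an arbitrary label; any sklearn estimator works here
    linear = SklearnMethod(LinearRegression(), metrics=["r2", "neg_mean_squared_error"])
    an = Analysis(
        methods=[("linear", linear)],
        metric_names=["r2", "mse"],
        datasets=["cpu_small", 1030],  # names or integer IDs; IDs avoid version ambiguity
        data_source="openml",
        random_state=42,
    )
    an.run()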
 mlgauge/VERSION        |   2 +-
 mlgauge/analysis.py    | 124 ++++++++++++++++++++++++++++++++---------
 mlgauge/method.py      |  14 +++--
 requirements.txt       |   1 +
 tests/test_analysis.py |  48 +++++++++++-----
 5 files changed, 142 insertions(+), 47 deletions(-)

diff --git a/mlgauge/VERSION b/mlgauge/VERSION
index 3b04cfb..be58634 100644
--- a/mlgauge/VERSION
+++ b/mlgauge/VERSION
@@ -1 +1 @@
-0.2
+0.3
diff --git a/mlgauge/analysis.py b/mlgauge/analysis.py
index 8f24559..392ae3e 100644
--- a/mlgauge/analysis.py
+++ b/mlgauge/analysis.py
@@ -1,13 +1,14 @@
 import os
-import pmlb
+from copy import deepcopy
+from tqdm import tqdm

 import numpy as np
 import pandas as pd
 import xarray as xr
 import seaborn as sns
+import pmlb
+import openml

-from tqdm import tqdm
-from copy import deepcopy
 from sklearn.model_selection import train_test_split
 from sklearn.utils import check_random_state
@@ -34,16 +35,20 @@ class Analysis:
         result.loc['houses', 'linear', 'mse', 'test']

+    .. note::
+
+        When integer IDs are used for openml datasets, the dataset keys of the
+        ``results`` attribute are stored as strings.
+
     Refer to the documentation of `xarray <http://xarray.pydata.org/en/stable/>`_ for more detailed usage.
     """

-    # TODO allow metrics to be null
     def __init__(
         self,
         methods,
         metric_names=None,
         datasets="all",
         n_datasets=20,
+        data_source="pmlb",
         drop_na=False,
         use_test_set=True,
         test_size=0.25,
@@ -66,7 +71,8 @@ def __init__(
                 **"regression"**: randomly select `n_datasets` from all available regression datasets in pmlb.

-                **list of strings**: a list of valid pmlb dataset names.
+                **list of strings**: a list of valid pmlb/openml dataset names.
+                **list of ints**: a list of valid openml dataset IDs. IDs are recommended for openml, since names can be ambiguous across dataset versions.

                 **list of ('dataset_name', (X, y)) tuples**: Use this format to pass a custom dataset as X, y data.
@@ -75,7 +81,7 @@ def __init__(
                 Here, X, y could be numpy arrays or pandas DataFrames; using a DataFrame allows the input feature names to be passed to the methods.

             n_datasets (int): Number of datasets to randomly sample from the available pmlb datasets. Ignored if `datasets` is not a string.
-
+            data_source (str): Source to fetch the datasets from when names/IDs are passed. Either 'pmlb' or 'openml'.
             drop_na (bool): If True will drop all rows in the dataset with null values.
             random_state (None, int or RandomState instance): seed for the PRNG.
             use_test_set (bool): If the methods use a testing set.
@@ -91,6 +97,12 @@ def __init__(
         self.__methods = self._precheck_methods(methods)
         self.metric_names = metric_names

+        if data_source not in ("pmlb", "openml"):
+            raise ValueError("data_source must be 'pmlb' or 'openml'")
+        if data_source == "openml" and not isinstance(datasets, list):
+            raise TypeError("Provide a list of dataset IDs/names for openml")
+
+        self.data_source = data_source
         self.datasets = self._precheck_dataset(datasets)
         if isinstance(
             self.datasets, str
@@ -129,27 +141,35 @@ def run(self):
         with redirect_stdout() as stdout:
             # linespacing logic (18 additional chars for title etc.)
-            maxl = min(max([len(x) for x in self.datasets]) + 18, 80)
+            _datasets = map(
+                lambda x: x[0] if isinstance(x, tuple) else x, self.datasets
+            )  # get dataset names
+            maxl = min(max([len(str(x)) for x in _datasets]) + 18, 80)

             # iterate datasets
             datasets = tqdm(self.datasets, file=stdout, dynamic_ncols=True)
             for dataset in datasets:
+                _dataset_name = dataset[0] if isinstance(dataset, tuple) else dataset
                 datasets.set_description(
                     f"{{0: <{maxl}}}".format(
-                        f"{colors.GREEN}Datasets [{dataset}]{colors.ENDC}"
+                        f"{colors.GREEN}Datasets [{_dataset_name}]{colors.ENDC}"
                     )
                 )

                 if self.use_test_set:
                     (
                         dataset_name,
                         feature_names,
+                        category_indicator,
                         (X_train, y_train),
                         (X_test, y_test),
                     ) = self._get_dataset(dataset)
                 else:
-                    dataset_name, feature_names, (X_train, y_train) = self._get_dataset(
-                        dataset
-                    )
+                    (
+                        dataset_name,
+                        feature_names,
+                        category_indicator,
+                        (X_train, y_train),
+                    ) = self._get_dataset(dataset)

                 # iterate methods
                 methods = tqdm(
@@ -166,17 +186,25 @@ def run(self):
                     method.set_test_set(self.use_test_set)
                     # create output directory
                     output_dir = os.path.join(
-                        self.output_dir, dataset_name, method_name
+                        self.output_dir, str(dataset_name), method_name
                     )
                     os.makedirs(output_dir, exist_ok=True)
                     method.set_output_dir(output_dir)

                     # get training scores
-                    train_scores = method.train(X_train, y_train, feature_names)
+                    train_scores = method.train(
+                        X_train, y_train, feature_names, category_indicator
+                    )
+
+                    # integer openml IDs are stored as string keys in the results
+                    if isinstance(dataset_name, int):
+                        dataset_name = str(dataset_name)

                     # get optional testing scores
                     if self.use_test_set:
-                        test_scores = method.test(X_test, y_test, feature_names)
+                        test_scores = method.test(
+                            X_test, y_test, feature_names, category_indicator
+                        )
                         if self.metric_names:
                             self.results.loc[
                                 dataset_name, method_name, :, "train"
@@ -208,13 +236,24 @@ def _precheck_dataset(self, datasets):
             for d in datasets:
                 if isinstance(d, str):
                     # should be a valid pmlb dataset name
-                    if d not in pmlb.dataset_names:
-                        raise ValueError(f"Dataset {d} not in pmlb")
+                    if self.data_source == "pmlb":
+                        if d not in pmlb.dataset_names:
+                            raise ValueError(f"Dataset {d} not in pmlb")
+
+                elif isinstance(d, int):
+                    if self.data_source != "openml":
+                        raise ValueError("Integer dataset IDs are only valid for openml")
+
                 elif isinstance(d, tuple):
                     if not isinstance(d[0], str):
                         raise ValueError(
                             "First element of the tuple must be the name of the dataset"
                         )
+                    if len(d) not in [2, 3]:
+                        raise ValueError(
+                            "Custom dataset input should be a tuple of length 2 or 3"
+                        )
+
                 else:
                     raise TypeError(f"Invalid type {type(d)} for dataset.")
             else:
@@ -275,19 +314,27 @@ def _initialize_results(self):

     def _get_dataset_name(self, dataset):
         """Get the supplied name of the dataset"""
-        if isinstance(dataset, str):
+        if isinstance(dataset, (str, int)):
             return dataset
         elif isinstance(dataset, tuple):
             return dataset[0]

     def _get_dataset(self, dataset):
         """Load and return the dataset as X, y numpy arrays"""
-        if isinstance(dataset, str):  # Use pmlb
-            data = pmlb.fetch_data(dataset, local_cache_dir=self.local_cache_dir)
+        category_indicator = None  # boolean list marking categorical columns
+
+        if isinstance(dataset, str):  # Use pmlb or openml
+            if self.data_source == "pmlb":
+                data = pmlb.fetch_data(dataset, local_cache_dir=self.local_cache_dir)

-            # Get feature names and get X,y numpy arrays
-            X = data.drop("target", axis=1)
-            y = data["target"]
+                # Get feature names and get X, y arrays
+                X = data.drop("target", axis=1)
+                y = data["target"]
+            elif self.data_source == "openml":
+                X, y, category_indicator = self._fetch_openml_data(dataset)
+
+        elif isinstance(dataset, int):
+            X, y, category_indicator = self._fetch_openml_data(dataset)

         elif isinstance(dataset, tuple):
             if len(dataset) == 2:
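For reference, the new _fetch_openml_data helper below wraps the standard
openml-python fetch path; run standalone it looks like this (dataset ID 1030
is borrowed from the new test and is otherwise illustrative):

    import openml

    dataset = openml.datasets.get_dataset(1030)
    X, y, category_indicator, attribute_names = dataset.get_data(
        dataset_format="dataframe", target=dataset.default_target_attribute
    )
    # category_indicator holds one boolean per column of X,
    # True where that column is categorical
    print([name for name, is_cat in zip(attribute_names, category_indicator) if is_cat])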
"openml": + X, y, category_indicator = self._fetch_openml_data(dataset) + + elif isinstance(dataset, int): + X, y, category_indicator = self._fetch_openml_data(dataset) elif isinstance(dataset, tuple): if len(dataset) == 2: @@ -300,9 +347,20 @@ def _get_dataset(self, dataset): if self.use_test_set: X_test, y_test = self._format_na(X_test, y_test) - return dataset, feature_names, (X_train, y_train), (X_test, y_test) + return ( + dataset, + feature_names, + category_indicator, + (X_train, y_train), + (X_test, y_test), + ) else: - return dataset, feature_names, (X_train, y_train) + return ( + dataset, + feature_names, + category_indicator, + (X_train, y_train), + ) if self.use_test_set: # Perform train-test splits X_train, X_test, y_train, y_test = train_test_split( @@ -315,11 +373,25 @@ def _get_dataset(self, dataset): feature_names = self._get_feature_names(X_train) X_train, y_train = self._format_na(X_train, y_train) X_test, y_test = self._format_na(X_test, y_test) - return dataset, feature_names, (X_train, y_train), (X_test, y_test) + return ( + dataset, + feature_names, + category_indicator, + (X_train, y_train), + (X_test, y_test), + ) else: # Directly format and return train set feature_names = self._get_feature_names(X) X_train, y_train = self._format_na(X, y) - return dataset, feature_names, (X_train, y_train) + return dataset, feature_names, category_indicator, (X_train, y_train) + + def _fetch_openml_data(self, dataset_id): + """Get the openml dataset with the category indicator""" + data = openml.datasets.get_dataset(dataset_id) + X, y, category_indicator, attribute_names = data.get_data( + dataset_format="dataframe", target=data.default_target_attribute + ) + return X, y, category_indicator def _get_feature_names(self, X): """Get the list of feature names from input data""" diff --git a/mlgauge/method.py b/mlgauge/method.py index d98df5e..af97479 100644 --- a/mlgauge/method.py +++ b/mlgauge/method.py @@ -43,26 +43,28 @@ def set_test_set(self, use_test_set): """ self.use_test_set = use_test_set - def train(self, X_train, y_train, feature_names=None): + def train(self, X_train, y_train, feature_names=None, category_indicator=None): """Train the model and return the training score. Args: X_train (array): array of training vector. y_train (array): array of target vector. - feature_names (list): list of names of the features in X_train + feature_names (list): list of names of the features in X_train. + category_indicator (list): list of boolean indicating whether a feature is a categorical variable. Raises: NotImplementedError: raised when called from the base class. """ raise NotImplementedError - def test(self, X_test, y_test, feature_names=None): + def test(self, X_test, y_test, feature_names=None, category_indicator=None): """Evaluate the model and return the test score. Args: X_test (array): array of training vector. y_test (array): array of target vector. feature_names (list): list of names of the features in X_test + category_indicator (list): list of boolean indicating whether a feature is a categorical variable. Raises: NotImplementedError: raised when called from the base class. @@ -90,13 +92,14 @@ def __init__(self, estimator, metrics, export_model=False, cv=5): self.export_model = export_model self.cv = cv - def train(self, X_train, y_train, feature_names=None): + def train(self, X_train, y_train, feature_names=None, category_indicator=None): """Train the model and return the training score. Args: X_train (array): array of training vector. 
diff --git a/requirements.txt b/requirements.txt
index 3486629..83f43a5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
+openml>=0.11.0
 pmlb>=1.0
 pandas>=1.0.3
 scikit-learn>=0.22.2
diff --git a/tests/test_analysis.py b/tests/test_analysis.py
index df3e8c7..7fd5c01 100644
--- a/tests/test_analysis.py
+++ b/tests/test_analysis.py
@@ -60,7 +60,7 @@ def test_string(self, regressor, tmp_path):
         )
         assert len(an.datasets) == 5

-    def test_list(self, regressor, tmp_path):
+    def test_pmlb_list(self, regressor, tmp_path):
         # should work with a list of valid pmlb dataset names
         an = Analysis(
             methods=[("dummy", regressor)],
@@ -83,16 +83,34 @@ def test_list(self, regressor, tmp_path):
             local_cache_dir=PMLB_CACHE,
         )

+    def test_openml_list(self, regressor, tmp_path):
+        # should work with a list of valid openml dataset names/IDs
+        an = Analysis(
+            methods=[("dummy", regressor)],
+            metric_names=["r2", "max_error"],
+            datasets=[
+                "wind",
+                "cpu_small",
+                1030,
+            ],  # sometimes fails when the openml servers are busy
+            data_source="openml",
+            random_state=SEED,
+            output_dir=tmp_path,
+            local_cache_dir=PMLB_CACHE,
+        )
+        assert len(an.datasets) == 3
+        an.run()
+
     def test_tuple(self, regressor, tmp_path):
         # should work with a list of ('name', (X, y)) tuples
         datasets = [
             (
                 "data_1",
-                *make_regression(n_samples=200, n_features=5, random_state=SEED),
+                make_regression(n_samples=200, n_features=5, random_state=SEED),
             ),
             (
                 "data_2",
-                *make_regression(n_samples=1000, n_features=50, random_state=SEED),
+                make_regression(n_samples=1000, n_features=50, random_state=SEED),
             ),
         ]
         an = Analysis(
@@ -110,16 +128,16 @@ def test_tuple_train_test(self, regressor, tmp_path):
         datasets = [
             (
                 "data_1",
-                *make_regression(n_samples=200, n_features=5, random_state=SEED),
+                make_regression(n_samples=200, n_features=5, random_state=SEED),
             ),
             (
                 "data_2",
-                *make_regression(n_samples=1000, n_features=50, random_state=SEED),
+                make_regression(n_samples=1000, n_features=50, random_state=SEED),
             ),
         ]

         def test_split(data):
-            name, X, y = data
+            name, (X, y) = data
             X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=SEED)
             return name, (X_train, y_train), (X_test, y_test)
@@ -135,27 +153,25 @@ def test_split(data):
         )
         assert len(an.datasets) == 2

-    def test_mixed(self, regressor, tmp_path):
+    def test_pmlb_mixed(self, regressor, tmp_path):
        # should work with a mix of strings, tuples, and tuples of tuples
         datasets = [
             (
                 "data_1",
-                *make_regression(n_samples=200, n_features=5, random_state=SEED),
+                make_regression(n_samples=200, n_features=5, random_state=SEED),
             ),
             (
                 "data_2",
-                *make_regression(n_samples=1000, n_features=50, random_state=SEED),
+                make_regression(n_samples=1000, n_features=50, random_state=SEED),
             ),
         ]

         def test_split(data):
-            name, X, y = data
+            name, (X, y) = data
             X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=SEED)
             return name, (X_train, y_train), (X_test, y_test)

-        datasets = (
-            list(map(test_split, datasets)) + datasets + ["adult", "cars", "pima"]
-        )
+        datasets = list(map(test_split, datasets)) + datasets + ["adult"]

         an = Analysis(
             methods=[("dummy", regressor)],
@@ -164,8 +180,10 @@ def test_split(data):
             random_state=SEED,
             output_dir=tmp_path,
             local_cache_dir=PMLB_CACHE,
+            use_test_set=True,
         )
-        assert len(an.datasets) == 7
+        assert len(an.datasets) == 5
+        an.run()


 # Test dropna
@@ -176,7 +194,7 @@ def __init__(self, dropna):
             self.drop_na = dropna
             super().__init__()

-        def train(self, X, y, feature_names):
+        def train(self, X, y, feature_names, category_indicator=None):
             dat = np.hstack([X, y.reshape(-1, 1)])
             if self.drop_na:
                 assert not np.isnan(dat).any(), "data has missing values"
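A note on the string-key behaviour documented in the Analysis docstring:
after an.run(), a dataset passed as the integer 1030 is looked up with the
string "1030" (a sketch, reusing the names from the usage note at the top of
this patch):

    # integer openml IDs become string coordinates in the results xarray
    score = an.results.loc["1030", "linear", "r2", "test"]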