From d9c1ec7010f2915f286a6036b72ca91a46425318 Mon Sep 17 00:00:00 2001
From: Vinay D
Date: Tue, 5 Sep 2023 17:28:16 +0530
Subject: [PATCH] Add real-world datasets for benchmarking

---
 python/cuml/benchmark/algorithms.py |   4 +-
 python/cuml/benchmark/datagen.py    | 288 +++++++++++++++++++++++-----
 python/cuml/benchmark/runners.py    |  18 +-
 3 files changed, 258 insertions(+), 52 deletions(-)

diff --git a/python/cuml/benchmark/algorithms.py b/python/cuml/benchmark/algorithms.py
index 863ab99134..79214787c7 100644
--- a/python/cuml/benchmark/algorithms.py
+++ b/python/cuml/benchmark/algorithms.py
@@ -337,7 +337,7 @@ def all_algorithms():
         AlgorithmPair(
             sklearn.ensemble.RandomForestClassifier,
             cuml.ensemble.RandomForestClassifier,
-            shared_args={"max_features": "sqrt", "n_estimators": 50},
+            shared_args={},
             cpu_args={"n_jobs": -1},
             name="RandomForestClassifier",
             accepts_labels=True,
@@ -348,7 +348,7 @@ def all_algorithms():
         AlgorithmPair(
             sklearn.ensemble.RandomForestRegressor,
             cuml.ensemble.RandomForestRegressor,
-            shared_args={"max_features": 1.0, "n_estimators": 50},
+            shared_args={},
             cpu_args={"n_jobs": -1},
             name="RandomForestRegressor",
             accepts_labels=True,
diff --git a/python/cuml/benchmark/datagen.py b/python/cuml/benchmark/datagen.py
index 54567a4ea8..2f49ca292e 100644
--- a/python/cuml/benchmark/datagen.py
+++ b/python/cuml/benchmark/datagen.py
@@ -39,6 +39,7 @@
 from cuml.internals import input_utils
 from urllib.request import urlretrieve
 import sklearn.model_selection
+from sklearn.datasets import load_svmlight_file, fetch_covtype
 import cuml.datasets
 from cuml.internals.safe_imports import cpu_only_import
 import os
@@ -71,7 +72,10 @@ def _gen_data_regression(
         dtype=dtype,
     )
 
-    return X_arr, y_arr
+    X_df = cudf.DataFrame(X_arr)
+    y_df = cudf.Series(y_arr)
+
+    return X_df, y_df
 
 
 def _gen_data_blobs(
@@ -117,68 +121,201 @@ def _gen_data_classification(
         random_state=random_state,
         dtype=dtype,
     )
+    X_df = cudf.DataFrame(X_arr)
+    y_df = cudf.Series(y_arr)
+    return X_df, y_df
 
-    return X_arr, y_arr
 
+# Default location to cache datasets
+DATASETS_DIRECTORY = "."
 
-def _gen_data_higgs(n_samples=None, n_features=None, dtype=np.float32):
-    """Wrapper returning Higgs in Pandas format"""
-    X_df, y_df = load_higgs()
-    if n_samples == 0:
-        n_samples = X_df.shape[0]
-    if n_features == 0:
-        n_features = X_df.shape[1]
-    if n_features > X_df.shape[1]:
+
+def _gen_data_airline_regression(datasets_root_dir):
+
+    url = "http://kt.ijs.si/elena_ikonomovska/datasets/airline/airline_14col.data.bz2"
+
+    local_url = os.path.join(datasets_root_dir, os.path.basename(url))
+
+    cols = [
+        "Year",
+        "Month",
+        "DayofMonth",
+        "DayofWeek",
+        "CRSDepTime",
+        "CRSArrTime",
+        "UniqueCarrier",
+        "FlightNum",
+        "ActualElapsedTime",
+        "Origin",
+        "Dest",
+        "Distance",
+        "Diverted",
+        "ArrDelay",
+    ]
+    dtype = np.float64
+    dtype_columns = {
+        "Year": dtype,
+        "Month": dtype,
+        "DayofMonth": dtype,
+        "DayofWeek": dtype,
+        "CRSDepTime": dtype,
+        "CRSArrTime": dtype,
+        "FlightNum": dtype,
+        "ActualElapsedTime": dtype,
+        "Distance": dtype,
+        "Diverted": dtype,
+        "ArrDelay": dtype,
+    }
+
+    if not os.path.isfile(local_url):
+        urlretrieve(url, local_url)
+    df = pd.read_csv(local_url, names=cols, dtype=dtype_columns)
+
+    # Encode the categorical columns (UniqueCarrier, Origin, Dest) as numeric codes
+    for col in df.select_dtypes(["object"]).columns:
+        df[col] = df[col].astype("category").cat.codes
+
+    X = df[df.columns.difference(["ArrDelay"])]
+    y = df["ArrDelay"]
+
+    return X, y
+
+
+def _gen_data_airline_classification(datasets_root_dir):
+    X, y = _gen_data_airline_regression(datasets_root_dir)
+    y = 1 * (y > 0)
+    return X, y
+
+
+def _gen_data_bosch(datasets_root_dir):
+
+    local_url = os.path.join(datasets_root_dir, "train_numeric.csv.zip")
+
+    if not os.path.isfile(local_url):
         raise ValueError(
-            "Higgs dataset has only %d features, cannot support %d"
-            % (X_df.shape[1], n_features)
+            "Bosch dataset not found (search path: %s)" % local_url
         )
-    if n_samples > X_df.shape[0]:
+
+    df = pd.read_csv(
+        local_url, index_col=0, compression="zip", dtype=np.float32
+    )
+
+    X = df.iloc[:, :-1]
+    y = df.iloc[:, -1]
+
+    return X, y
+
+
+def _gen_data_covtype(datasets_root_dir):
+
+    X, y = fetch_covtype(data_home=datasets_root_dir, return_X_y=True)
+    # Labels in covtype start from 1; shift them to start from 0
+    y = y - 1
+
+    X = pd.DataFrame(X)
+    y = pd.Series(y)
+
+    return X, y
+
+
+def _gen_data_epsilon(datasets_root_dir):
+
+    url_train = (
+        "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary"
+        "/epsilon_normalized.bz2"
+    )
+    url_test = (
+        "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary"
+        "/epsilon_normalized.t.bz2"
+    )
+
+    local_url_train = os.path.join(
+        datasets_root_dir, os.path.basename(url_train)
+    )
+    local_url_test = os.path.join(
+        datasets_root_dir, os.path.basename(url_test)
+    )
+
+    if not os.path.isfile(local_url_train):
+        urlretrieve(url_train, local_url_train)
+    if not os.path.isfile(local_url_test):
+        urlretrieve(url_test, local_url_test)
+
+    X_train, y_train = load_svmlight_file(local_url_train, dtype=np.float32)
+    X_test, y_test = load_svmlight_file(local_url_test, dtype=np.float32)
+
+    X_train = pd.DataFrame(X_train.toarray())
+    X_test = pd.DataFrame(X_test.toarray())
+
+    # Map the -1/+1 labels to 0/1
+    y_train[y_train <= 0] = 0
+    y_test[y_test <= 0] = 0
+    y_train = pd.Series(y_train)
+    y_test = pd.Series(y_test)
+
+    X = pd.concat([X_train, X_test], ignore_index=True)
+    y = pd.concat([y_train, y_test], ignore_index=True)
+
+    return X, y
+
+
+def _gen_data_fraud(datasets_root_dir):
+
+    local_url = os.path.join(datasets_root_dir, "creditcard.csv.zip")
+
+    if not os.path.isfile(local_url):
         raise ValueError(
-            "Higgs dataset has only %d rows, cannot support %d"
-            % (X_df.shape[0], n_samples)
+            "Fraud dataset not found (search path: %s)" % local_url
         )
-    return X_df.iloc[:n_samples, :n_features].astype(dtype), y_df.iloc[
-        :n_samples
-    ].astype(dtype)
-
-def _download_and_cache(url, compressed_filepath, decompressed_filepath):
-    if not os.path.isfile(compressed_filepath):
-        urlretrieve(url, compressed_filepath)
-    if not os.path.isfile(decompressed_filepath):
-        cf = gzip.GzipFile(compressed_filepath)
-        with open(decompressed_filepath, "wb") as df:
-            df.write(cf.read())
-    return decompressed_filepath
+    df = pd.read_csv(local_url, dtype=np.float32)
+    X = df[[col for col in df.columns if col.startswith("V")]]
+    y = df["Class"]
+    return X, y
 
-# Default location to cache datasets
-DATASETS_DIRECTORY = "."
 
+def _gen_data_higgs(datasets_root_dir):
+
-def load_higgs():
-    """Returns the Higgs Boson dataset as an X, y tuple of dataframes."""
     higgs_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz"  # noqa
-    decompressed_filepath = _download_and_cache(
-        higgs_url,
-        os.path.join(DATASETS_DIRECTORY, "HIGGS.csv.gz"),
-        os.path.join(DATASETS_DIRECTORY, "HIGGS.csv"),
-    )
+
+    local_url = os.path.join(datasets_root_dir, os.path.basename(higgs_url))
+
+    if not os.path.isfile(local_url):
+        urlretrieve(higgs_url, local_url)
+
     col_names = ["label"] + [
         "col-{}".format(i) for i in range(2, 30)
     ]  # Assign column names
     dtypes_ls = [np.int32] + [
         np.float32 for _ in range(2, 30)
     ]  # Assign dtypes to each column
-    data_df = pd.read_csv(
-        decompressed_filepath,
+
+    df = pd.read_csv(
+        local_url,
         names=col_names,
         dtype={k: v for k, v in zip(col_names, dtypes_ls)},
     )
-    X_df = data_df[data_df.columns.difference(["label"])]
-    y_df = data_df["label"]
-    return cudf.DataFrame.from_pandas(X_df), cudf.Series.from_pandas(y_df)
+
+    X = df[df.columns.difference(["label"])]
+    y = df["label"]
+
+    return X, y
+
+
+def _gen_data_year(datasets_root_dir):
+
+    year_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00203/YearPredictionMSD.txt.zip"
+
+    local_url = os.path.join(datasets_root_dir, "YearPredictionMSD.txt.zip")
+
+    if not os.path.isfile(local_url):
+        urlretrieve(year_url, local_url)
+
+    df = pd.read_csv(local_url, header=None)
+    X = df.iloc[:, 1:]
+    y = df.iloc[:, 0]
+
+    return X, y
 
 
 def _convert_to_numpy(data):
@@ -346,8 +483,16 @@ def _convert_to_scipy_sparse_csc(data):
     "zeros": _gen_data_zeros,
     "classification": _gen_data_classification,
     "regression": _gen_data_regression,
+    "airline_regression": _gen_data_airline_regression,
+    "airline_classification": _gen_data_airline_classification,
+    "bosch": _gen_data_bosch,
+    "covtype": _gen_data_covtype,
+    "epsilon": _gen_data_epsilon,
+    "fraud": _gen_data_fraud,
     "higgs": _gen_data_higgs,
+    "year": _gen_data_year,
 }
+
 _data_converters = {
     "numpy": _convert_to_numpy,
     "cupy": _convert_to_cupy,
@@ -371,6 +516,8 @@ def gen_data(
     n_samples=0,
     n_features=0,
     test_fraction=0.0,
+    datasets_root_dir=DATASETS_DIRECTORY,
+    dtype=np.float32,
     **kwargs,
 ):
     """Returns a tuple of data from the specified generator.
@@ -383,7 +530,7 @@ def gen_data(
     dataset_format : str
         Type of data to return. (One of cudf, numpy, pandas, gpuarray)
     n_samples : int
-        Number of samples to include in training set (regardless of test split)
+        Total number of samples to load, including training and testing samples
     test_fraction : float
         Fraction of the dataset to partition randomly into the test set.
        If this is 0.0, no test set will be created.
@@ -394,12 +541,56 @@
         containing matrices or dataframes of the requested format.
         test_features and test_labels may be None if no splitting was done.
     """
-    data = _data_generators[dataset_name](
-        int(n_samples / (1 - test_fraction)), n_features, **kwargs
+
+    pickle_x_file_url = os.path.join(
+        datasets_root_dir, "%s_x.pkl" % dataset_name
     )
-    if test_fraction != 0.0:
+    pickle_y_file_url = os.path.join(
+        datasets_root_dir, "%s_y.pkl" % dataset_name
+    )
+
+    mock_datasets = ["regression", "classification", "blobs", "zeros"]
+    if dataset_name in mock_datasets:
+        X_df, y_df = _data_generators[dataset_name](
+            n_samples=n_samples, n_features=n_features, dtype=dtype, **kwargs
+        )
+    else:
+        if os.path.isfile(pickle_x_file_url):
+            # Load the dataset from the local pickle cache
+            X = pd.read_pickle(pickle_x_file_url)
+            y = pd.read_pickle(pickle_y_file_url)
+        else:
+            X, y = _data_generators[dataset_name](datasets_root_dir, **kwargs)
+
+            # Cache the dataset for future use
+            X.to_pickle(pickle_x_file_url)
+            y.to_pickle(pickle_y_file_url)
+
+        if n_samples > X.shape[0]:
+            raise ValueError(
+                "%s dataset has only %d rows, cannot support %d"
+                % (dataset_name, X.shape[0], n_samples)
+            )
+
+        if n_features > X.shape[1]:
+            raise ValueError(
+                "%s dataset has only %d features, cannot support %d"
+                % (dataset_name, X.shape[1], n_features)
+            )
+
         if n_samples == 0:
-            n_samples = int(data[0].shape[0] * (1 - test_fraction))
+            n_samples = X.shape[0]
+
+        if n_features == 0:
+            n_features = X.shape[1]
+
+        X_df = cudf.DataFrame.from_pandas(
+            X.iloc[0:n_samples, 0:n_features].astype(dtype)
+        )
+        y_df = cudf.Series.from_pandas(y.iloc[0:n_samples].astype(dtype))
+
+    data = (X_df, y_df)
+    if test_fraction != 0.0:
         random_state_dict = (
             {"random_state": kwargs["random_state"]}
             if "random_state" in kwargs
@@ -407,12 +598,13 @@
         )
         X_train, X_test, y_train, y_test = tuple(
             sklearn.model_selection.train_test_split(
-                *data, train_size=n_samples, **random_state_dict
+                *data,
+                test_size=int(n_samples * test_fraction),
+                **random_state_dict,
             )
         )
         data = (X_train, y_train, X_test, y_test)
     else:
         data = (*data, None, None)  # No test set
-
     data = _data_converters[dataset_format](data)
     return data
diff --git a/python/cuml/benchmark/runners.py b/python/cuml/benchmark/runners.py
index 505bb9defb..5ea69c105b 100644
--- a/python/cuml/benchmark/runners.py
+++ b/python/cuml/benchmark/runners.py
@@ -149,6 +149,14 @@
             )
         )
 
+        if n_samples == 0:
+            # Report n_samples as training samples + testing samples
+            n_samples = data[0].shape[0] + data[2].shape[0]
+
+        if n_features == 0:
+            # Report the number of features actually used
+            n_features = data[0].shape[1]
+
     return dict(
         cuml_time=cu_elapsed,
         cpu_time=cpu_elapsed,
@@ -291,8 +299,6 @@
         for rep in cpu_timer.benchmark_runs():
             cpu_model = algo_pair.run_cpu(
                 data,
-                **param_overrides,
-                **cpu_param_overrides,
                 **setup_override,
             )
         cpu_elapsed = np.min(cpu_timer.timings)
@@ -312,6 +318,14 @@
     else:
         cpu_elapsed = 0.0
 
+    if n_samples == 0:
+        # Report n_samples as training samples + testing samples
+        n_samples = data[0].shape[0] + data[2].shape[0]
+
+    if n_features == 0:
+        # Report the number of features actually used
+        n_features = data[0].shape[1]
+
     return dict(
         cuml_time=cu_elapsed,
         cpu_time=cpu_elapsed,
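
Usage sketch (illustrative, not part of the patch): the snippet below shows how the reworked `gen_data` entry point might be exercised once this change is merged. The dataset name, row count, and cache directory are example values; any key registered in `_data_generators` works the same way.

    from cuml.benchmark.datagen import gen_data

    # The first call downloads HIGGS.csv.gz into datasets_root_dir and
    # pickles it as higgs_x.pkl / higgs_y.pkl; subsequent calls read the
    # pickle cache directly instead of re-downloading.
    X_train, y_train, X_test, y_test = gen_data(
        dataset_name="higgs",
        dataset_format="cudf",  # or "numpy", "pandas", "gpuarray"
        n_samples=100000,       # total rows: training + testing
        test_fraction=0.2,      # test_size = int(100000 * 0.2) rows
        datasets_root_dir=".",
    )

Because `n_samples` now means the total row count, the call above yields 80,000 training and 20,000 testing rows; with `test_fraction=0.0`, the last two return values are `None`.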