Commit
Merge pull request #81 from orchardbirds/80-rmspe
80 rmspe
orchardbirds authored Jun 30, 2021
2 parents 9dc707f + 5a84f6b commit 69f248a
Showing 6 changed files with 168 additions and 2 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -16,6 +16,7 @@ Main features:
- [Weighted Cross Entropy](https://orchardbirds.github.io/bokbokbok/tutorials/weighted_cross_entropy.html)
- [Weighted Focal Loss](https://orchardbirds.github.io/bokbokbok/tutorials/focal_loss.html)
- [Log Cosh Loss](https://orchardbirds.github.io/bokbokbok/tutorials/log_cosh_loss.html)
- [Root Mean Squared Percentage Error](https://orchardbirds.github.io/bokbokbok/tutorials/RMSPE.html)
- [F1 score](https://orchardbirds.github.io/bokbokbok/tutorials/F1_score.html)
- [Quadratic Weighted Kappa](https://orchardbirds.github.io/bokbokbok/tutorials/quadratic_weighted_kappa.html)

4 changes: 3 additions & 1 deletion bokbokbok/eval_metrics/regression/__init__.py
@@ -3,8 +3,10 @@

from .regression_eval_metrics import (
    LogCoshMetric,
+    RMSPEMetric,
)

__all__ = [
-    "LogCoshMetric"
+    "LogCoshMetric",
+    "RMSPEMetric",
]
31 changes: 31 additions & 0 deletions bokbokbok/eval_metrics/regression/regression_eval_metrics.py
@@ -28,3 +28,34 @@ def log_cosh_error(yhat, dtrain, XGBoost=XGBoost):
        return 'LogCosh', float(np.sum(elements) / len(y)), False

    return log_cosh_error


def RMSPEMetric(XGBoost=False):
    """
    Calculates the Root Mean Squared Percentage Error:
    https://www.kaggle.com/c/optiver-realized-volatility-prediction/overview/evaluation

    There is no loss function for this because the gradient is constant,
    meaning the Hessian is equal to 0.

    Args:
        XGBoost (Bool): Set to True if using XGBoost. LightGBM is assumed by default.
            Note that you should also set `maximize=False` in the XGBoost train function.
    """

    def RMSPE(yhat, dtrain, XGBoost=XGBoost):
        """
        Root Mean Squared Percentage Error.
        All input labels are required to be non-zero.

        yhat: Predictions
        dtrain: The XGBoost / LightGBM dataset
        XGBoost (Bool): If XGBoost is to be implemented
        """

        y = dtrain.get_label()
        elements = ((y - yhat) / y) ** 2
        if XGBoost:
            return 'RMSPE', float(np.sqrt(np.sum(elements) / len(y)))
        else:
            return 'RMSPE', float(np.sqrt(np.sum(elements) / len(y))), False

    return RMSPE
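
For reference, the metric implemented above is RMSPE = sqrt((1/N) * sum(((y_i - yhat_i) / y_i)^2)), which is why zero labels are disallowed. The following minimal sketch (not part of this commit) checks the returned callable against a plain NumPy computation; the _FakeDataset class is a hypothetical stand-in that only mimics the get_label() interface of LightGBM / XGBoost dataset objects.

import numpy as np

from bokbokbok.eval_metrics.regression import RMSPEMetric


class _FakeDataset:
    """Hypothetical stand-in exposing the get_label() interface."""

    def __init__(self, labels):
        self._labels = np.asarray(labels, dtype=float)

    def get_label(self):
        return self._labels


y_true = np.array([1.0, 2.0, 4.0])   # labels must be non-zero
y_pred = np.array([1.1, 1.8, 4.4])

# Plain NumPy reference: sqrt(mean(((y - yhat) / y) ** 2))
reference = float(np.sqrt(np.mean(((y_true - y_pred) / y_true) ** 2)))

name, value, is_higher_better = RMSPEMetric()(y_pred, _FakeDataset(y_true))
assert name == 'RMSPE' and is_higher_better is False
assert np.isclose(value, reference)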
131 changes: 131 additions & 0 deletions docs/tutorials/RMSPE.ipynb
@@ -0,0 +1,131 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### When to use Root Mean Squared Percentage Error?\n",
"\n",
"This function is defined according to [this Kaggle competition](https://www.kaggle.com/c/optiver-realized-volatility-prediction/overview/evaluation) for volatility calculation. \n",
"\n",
"This function cannot be used as a Loss function - the gradient is constant and hence the Hessian is 0. Nevertheless, it can still be used as an evaluation metric as the model trains."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.datasets import make_regression\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import mean_absolute_error\n",
"from bokbokbok.eval_metrics.regression import RMSPEMetric\n",
"\n",
"X, y = make_regression(n_samples=1000, \n",
" n_features=10, \n",
" random_state=41114)\n",
"\n",
"X_train, X_valid, y_train, y_valid = train_test_split(X, \n",
" y/100, \n",
" test_size=0.25, \n",
" random_state=41114)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Usage in LightGBM"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import lightgbm as lgb\n",
"\n",
"train = lgb.Dataset(X_train, y_train)\n",
"valid = lgb.Dataset(X_valid, y_valid, reference=train)\n",
"params = {\n",
" 'n_estimators': 3000,\n",
" 'seed': 41114,\n",
" 'n_jobs': 8,\n",
" 'num_leaves': 10,\n",
" 'learning_rate': 0.1,\n",
" 'verbose': 10,\n",
" #'objective': 'RMSE',\n",
" }\n",
"\n",
"clf = lgb.train(params=params,\n",
" train_set=train,\n",
" valid_sets=[train, valid],\n",
" valid_names=['train','valid'],\n",
" feval=RMSPEMetric(),\n",
" early_stopping_rounds=3000,\n",
" verbose_eval=1)\n",
"\n",
"mean_absolute_error(y_valid, clf.predict(X_valid))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Usage in XGBoost"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import xgboost as xgb\n",
"\n",
"dtrain = xgb.DMatrix(X_train, y_train)\n",
"dvalid = xgb.DMatrix(X_valid, y_valid)\n",
"\n",
"params = {\n",
" 'seed': 41114,\n",
" 'learning_rate': 0.1,\n",
" 'disable_default_eval_metric': 1\n",
" }\n",
"\n",
"bst = xgb.train(params,\n",
" dtrain=dtrain,\n",
" num_boost_round=3000,\n",
" early_stopping_rounds=100,\n",
" verbose_eval=100,\n",
" maximize=False,\n",
" feval=RMSPEMetric(XGBoost=True),\n",
" evals=[(dtrain, 'dtrain'), (dvalid, 'dvalid')])\n",
"\n",
"mean_absolute_error(y_valid, bst.predict(dvalid))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:skorecard_py37] *",
"language": "python",
"name": "conda-env-skorecard_py37-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.7"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
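
A note for readers running this notebook on newer library versions (an assumption, not part of this commit): LightGBM 4.x removed the early_stopping_rounds and verbose_eval arguments from lgb.train, moving them into callbacks, and recent XGBoost releases deprecate feval in favour of a custom_metric argument. A sketch of the equivalent LightGBM call, assuming the params, train, and valid objects defined in the notebook above:

import lightgbm as lgb

# Equivalent of the lgb.train call above for LightGBM >= 4.0:
# early stopping and evaluation logging are supplied as callbacks
# instead of keyword arguments (params, train, and valid come from
# the notebook cells above).
clf = lgb.train(
    params=params,
    train_set=train,
    valid_sets=[train, valid],
    valid_names=['train', 'valid'],
    feval=RMSPEMetric(),
    callbacks=[
        lgb.early_stopping(stopping_rounds=3000),
        lgb.log_evaluation(period=1),
    ],
)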
1 change: 1 addition & 0 deletions mkdocs.yml
@@ -16,6 +16,7 @@ nav:
    - Use Weighted Focal Loss: tutorials/focal_loss.ipynb
    - Use F1 Score: tutorials/F1_score.ipynb
    - Use Log Cosh Score: tutorials/log_cosh_loss.ipynb
    - Use Root Mean Squared Percentage Error: tutorials/RMSPE.ipynb
    - Use Quadratic Weighted Kappa: tutorials/quadratic_weighted_kappa.ipynb
  - Derivations:
    - A Note About Gradients in Classification Problems: derivations/note.md
2 changes: 1 addition & 1 deletion setup.py
@@ -35,7 +35,7 @@

setup(
    name="bokbokbok",
-    version="0.3",
+    version="0.4",
    description="Custom Losses and Metrics for XGBoost, LightGBM, CatBoost",
    long_description=long_description,
    long_description_content_type="text/markdown",