From f7a64e69c490be1f45463a55bbf4f9cbae453844 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francesc=20Mart=C3=AD=20Escofet?= <154450563+FrancescMartiEscofetQC@users.noreply.github.com> Date: Wed, 10 Jul 2024 15:43:31 +0200 Subject: [PATCH] Fix numpy - pandas conversion (#57) --- docs/examples/example_lime.ipynb | 41 +++++++++++--------------------- 1 file changed, 14 insertions(+), 27 deletions(-) diff --git a/docs/examples/example_lime.ipynb b/docs/examples/example_lime.ipynb index d3c97ee..e967df8 100644 --- a/docs/examples/example_lime.ipynb +++ b/docs/examples/example_lime.ipynb @@ -217,10 +217,10 @@ "source": [ "### Generating lime plots\n", "\n", - "``lime`` will expect a function which consumes an ``X`` and returns\n", + "``lime`` will expect a function which consumes a ``np.ndarray`` ``X`` and returns\n", "a one-dimensional vector of the same length as ``X``. We'll have to\n", "adapt the {meth}`~metalearners.rlearner.RLearner.predict` method of\n", - "our {class}`~metalearners.rlearner.RLearner` in two ways:\n", + "our {class}`~metalearners.rlearner.RLearner` in three ways:\n", "\n", "* We need to pass a value for the necessary parameter ``is_oos`` to {meth}`~metalearners.rlearner.RLearner.predict`.\n", "\n", @@ -228,6 +228,10 @@ " {meth}`~metalearners.rlearner.RLearner.predict` to be one-dimensional. 
This\n", " we can easily achieve via {func}`metalearners.utils.simplify_output`.\n", "\n", + "* We need to convert the ``np.ndarray`` back to a ``pd.DataFrame`` to work with categoricals\n", + " and cast each column to its dtype in ``df`` so that the categorical codes (which LightGBM uses internally) match,\n", + " see [this issue](https://github.com/microsoft/LightGBM/issues/5162) for more context.\n", + "\n", "This we can do as follows:" ] }, @@ -244,7 +248,11 @@ "from metalearners.utils import simplify_output\n", "\n", "def predict(X):\n", - " return simplify_output(rlearner.predict(X, is_oos=True))" + " X_pd = pd.DataFrame(X, copy=True)\n", + " for c in X_pd.columns:\n", + " # This line sets the cat.categories correctly (even if not all are present in X)\n", + " X_pd[c] = X_pd[c].astype(df[feature_columns].iloc[:, c].dtype)\n", + " return simplify_output(rlearner.predict(X_pd, is_oos=True))" ] }, { @@ -254,26 +262,7 @@ "where we set ``is_oos=True`` since ``lime`` will call\n", "{meth}`~metalearners.rlearner.RLearner.predict`\n", "with various inputs which will not be able to be recognized as\n", - "in-sample data.\n", - "\n", - "Since ``lime`` expects ``numpy`` datastructures, we'll have to\n", - "manually encode the categorical features of our ``pandas`` data\n", - "structure, see [this issue](https://github.com/microsoft/LightGBM/issues/5162) for more context." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "vscode": { - "languageId": "plaintext" - } - }, - "outputs": [], - "source": [ - "X = df[feature_columns].copy()\n", - "for categorical_feature_column in categorical_feature_columns:\n", - " X[categorical_feature_column] = X[categorical_feature_column].cat.codes" + "in-sample data." 
] }, { @@ -332,10 +321,8 @@ "from lime.lime_tabular import LimeTabularExplainer\n", "from lime.submodular_pick import SubmodularPick\n", "\n", - "X = X.to_numpy()\n", - "\n", "explainer = LimeTabularExplainer(\n", - " X,\n", + " df[feature_columns].to_numpy(),\n", " feature_names=feature_columns,\n", " categorical_features=categorical_feature_indices,\n", " categorical_names=categorical_names,\n", @@ -345,7 +332,7 @@ ")\n", "\n", "sp = SubmodularPick(\n", - " data=X,\n", + " data=df[feature_columns].to_numpy(),\n", " explainer=explainer,\n", " predict_fn=predict,\n", " method=\"sample\",\n",