From e2adaf4e91799a02f95e9af55b8c5aae8279146e Mon Sep 17 00:00:00 2001 From: Hector Lira Date: Wed, 8 Feb 2023 13:17:52 -0600 Subject: [PATCH 01/12] Add test notebook --- src/fklearn/causal/validation/ci_test.ipynb | 677 ++++++++++++++++++++ 1 file changed, 677 insertions(+) create mode 100644 src/fklearn/causal/validation/ci_test.ipynb diff --git a/src/fklearn/causal/validation/ci_test.ipynb b/src/fklearn/causal/validation/ci_test.ipynb new file mode 100644 index 00000000..f7d0ff27 --- /dev/null +++ b/src/fklearn/causal/validation/ci_test.ipynb @@ -0,0 +1,677 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "from fklearn.causal.effects import linear_effect\n", + "from fklearn.causal.validation.curves import cumulative_effect_curve" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from typing import List\n", + "\n", + "from toolz import curry\n", + "from fklearn.types import EffectFnType" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.DataFrame(dict(\n", + " t=[1, 1, 1, 2, 2, 2, 3, 3, 3],\n", + " x=[1, 2, 3, 1, 2, 3, 1, 2, 3],\n", + " y=[1, 1, 1, 2, 3, 4, 3, 5, 7],\n", + "))\n", + "\n", + "result = cumulative_effect_curve(df, prediction=\"x\", outcome=\"y\", treatment=\"t\", min_rows=3, steps=df.shape[0],\n", + " effect_fn=linear_effect)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([3. , 3. , 2.92857143, 2.5 , 2.5 ,\n", + " 2.46153846, 2. ])" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": {}, + "outputs": [], + "source": [ + "# def linear_ci(df, y, t, z=1.96):\n", + "# n = df.shape[0]\n", + "# t_bar = df[t].mean()\n", + "# beta1 = linear_effect(df, y, t)\n", + "# beta0 = df[y].mean() - beta1 * t_bar\n", + "# e = df[y] - (beta0 + beta1*df[t])\n", + "# se = np.sqrt(((1/(n-2))*np.sum(e**2))/np.sum((df[t]-t_bar)**2))\n", + "# return np.array([beta1 - z*se, beta1 + z*se])\n", + "\n", + "def linear_ci(df, t, y, z=1.96):\n", + " n = df.shape[0]\n", + " t_bar = df[t].mean()\n", + " beta1 = linear_effect(df, t, y)\n", + " beta0 = df[y].mean() - beta1 * t_bar\n", + " e = df[y] - (beta0 + beta1*df[t])\n", + " se = np.sqrt(((1/(n-2))*np.sum(e**2))/np.sum((df[t]-t_bar)**2))\n", + " print(f\"\"\"\n", + " n: {n}\n", + " t_bar: {t_bar}\n", + " beta1: {beta1}\n", + " beta0: {beta0}\n", + " \"\"\")\n", + " return z*se" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " n: 3\n", + " t_bar: 2.0\n", + " beta1: 3.0\n", + " beta0: -2.0\n", + " \n" + ] + }, + { + "data": { + "text/plain": [ + "0.0" + ] + }, + "execution_count": 87, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "linear_ci(ordered_df.head(3), \"t\", \"y\")" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
txy
2131
5234
8337
\n", + "
" + ], + "text/plain": [ + " t x y\n", + "2 1 3 1\n", + "5 2 3 4\n", + "8 3 3 7" + ] + }, + "execution_count": 82, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ordered_df = df.sort_values(by=\"x\", ascending=False)\n", + "ordered_df.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [], + "source": [ + "@curry\n", + "def effect_curves(\n", + " df: pd.DataFrame,\n", + " treatment: str,\n", + " outcome: str,\n", + " prediction: str,\n", + " min_rows: int = 30,\n", + " steps: int = 100,\n", + " effect_fn: EffectFnType = linear_effect,\n", + " ci_fn = None,\n", + ") -> pd.DataFrame:\n", + "\n", + " size: int = df.shape[0]\n", + " n_rows: List[int] = list(range(min_rows, size, size // steps)) + [size]\n", + "\n", + " cum_effect: np.ndarray = cumulative_effect_curve(\n", + " df=df,\n", + " treatment=treatment,\n", + " outcome=outcome,\n", + " prediction=prediction,\n", + " min_rows=min_rows,\n", + " steps=steps,\n", + " effect_fn=effect_fn,\n", + " )\n", + " ate: float = cum_effect[-1]\n", + " \n", + " effect_curves = pd.DataFrame({\"samples_count\": n_rows, \"cumulative_effect_curve\": cum_effect}).assign(\n", + " samples_fraction=lambda x: x[\"samples_count\"] / size,\n", + " cumulative_gain_curve=lambda x: x[\"samples_fraction\"] * x[\"cumulative_effect_curve\"],\n", + " random_model_cumulative_gain_curve=lambda x: x[\"samples_fraction\"] * ate,\n", + " relative_cumulative_gain_curve=lambda x: (\n", + " x[\"samples_fraction\"] * x[\"cumulative_effect_curve\"] - x[\"random_model_cumulative_gain_curve\"]\n", + " ),\n", + " )\n", + " \n", + " if ci_fn is not None:\n", + " \n", + " # crear un wrapper de linear_ci parecido a cumulative_effect_curve (wrapper) y effect_fn (similitud con linear_ci)\n", + " # el wrapper debe de solo arrojar un arreglo (igual que cumulative_effect_curve)\n", + " ordered_df = df.sort_values(prediction, ascending=False).reset_index(drop=True)\n", + " print(ordered_df.head(n_rows[0]), n_rows[0])\n", + " effect_errors = np.array([ci_fn(ordered_df.head(rows), treatment, outcome) for rows in n_rows])\n", + " \n", + " effect_curves = effect_curves.assign(\n", + " cumulative_effect_curve_error=effect_errors,\n", + " cumulative_gain_curve_error=lambda x: x[\"samples_fraction\"] * x[\"cumulative_effect_curve_error\"],\n", + " )\n", + "\n", + " return effect_curves\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.DataFrame(dict(\n", + " t=[1, 1, 1, 2, 2, 2, 3, 3, 3],\n", + " x=[1, 2, 3, 1, 2, 3, 1, 2, 3],\n", + " y=[1, 1, 1, 2, 3, 4, 3, 5, 7],\n", + "))" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " t x y\n", + "0 1 3 1\n", + "1 2 3 4\n", + "2 3 3 7 3\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
samples_countcumulative_effect_curvesamples_fractioncumulative_gain_curverandom_model_cumulative_gain_curverelative_cumulative_gain_curvecumulative_effect_curve_errorcumulative_gain_curve_error
033.0000000.3333331.0000000.6666670.3333330.0000000.000000
143.0000000.4444441.3333330.8888890.4444440.0000000.000000
252.9285710.5555561.6269841.1111110.5158730.5994440.333025
362.5000000.6666671.6666671.3333330.3333330.7747580.516505
472.5000000.7777781.9444441.5555560.3888890.6288550.489110
582.4615380.8888892.1880341.7777780.4102560.7654830.680429
692.0000001.0000002.0000002.0000000.0000000.9563820.956382
\n", + "
" + ], + "text/plain": [ + " samples_count cumulative_effect_curve samples_fraction \\\n", + "0 3 3.000000 0.333333 \n", + "1 4 3.000000 0.444444 \n", + "2 5 2.928571 0.555556 \n", + "3 6 2.500000 0.666667 \n", + "4 7 2.500000 0.777778 \n", + "5 8 2.461538 0.888889 \n", + "6 9 2.000000 1.000000 \n", + "\n", + " cumulative_gain_curve random_model_cumulative_gain_curve \\\n", + "0 1.000000 0.666667 \n", + "1 1.333333 0.888889 \n", + "2 1.626984 1.111111 \n", + "3 1.666667 1.333333 \n", + "4 1.944444 1.555556 \n", + "5 2.188034 1.777778 \n", + "6 2.000000 2.000000 \n", + "\n", + " relative_cumulative_gain_curve cumulative_effect_curve_error \\\n", + "0 0.333333 0.000000 \n", + "1 0.444444 0.000000 \n", + "2 0.515873 0.599444 \n", + "3 0.333333 0.774758 \n", + "4 0.388889 0.628855 \n", + "5 0.410256 0.765483 \n", + "6 0.000000 0.956382 \n", + "\n", + " cumulative_gain_curve_error \n", + "0 0.000000 \n", + "1 0.000000 \n", + "2 0.333025 \n", + "3 0.516505 \n", + "4 0.489110 \n", + "5 0.680429 \n", + "6 0.956382 " + ] + }, + "execution_count": 81, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "effect_curves(\n", + " df=df,\n", + " treatment=\"t\",\n", + " outcome=\"y\",\n", + " prediction=\"x\",\n", + " min_rows = 3,\n", + " steps = df.shape[0],\n", + " effect_fn = linear_effect,\n", + " ci_fn = linear_ci,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
txy
0111
3212
6313
1121
4223
7325
2131
5234
8337
\n", + "
" + ], + "text/plain": [ + " t x y\n", + "0 1 1 1\n", + "3 2 1 2\n", + "6 3 1 3\n", + "1 1 2 1\n", + "4 2 2 3\n", + "7 3 2 5\n", + "2 1 3 1\n", + "5 2 3 4\n", + "8 3 3 7" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.sort_values(by=\"x\", ascending=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0. , 0. , 0.59944419, 0.77475803, 0.62885517,\n", + " 0.76548284, 0.95638207])" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cumulative_elast_curve_ci(\n", + " dataset=df,\n", + " prediction=\"x\",\n", + " y=\"y\",\n", + " t=\"t\",\n", + " min_periods=3,\n", + " steps=df.shape[0]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "linear_effect(df, \"y\", \"t\")" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [], + "source": [ + "def elast_ci(df, y, t, z=1.96):\n", + " n = df.shape[0]\n", + " t_bar = df[t].mean()\n", + "# beta1 = elast(df, y, t)\n", + " beta1 = linear_effect(df, t, y)\n", + " beta0 = df[y].mean() - beta1 * t_bar\n", + " e = df[y] - (beta0 + beta1*df[t])\n", + " se = np.sqrt(((1/(n-2))*np.sum(e**2))/np.sum((df[t]-t_bar)**2))\n", + "# return np.array([beta1 - z*se, beta1 + z*se])\n", + " return z*se\n", + "\n", + "@curry\n", + "def elast(data, y, t):\n", + " # line coeficient for the one variable linear regression\n", + " return (np.sum((data[t] - data[t].mean())*(data[y] - data[y].mean())) /\n", + " np.sum((data[t] - data[t].mean())**2))" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "def cumulative_elast_curve_ci(dataset, prediction, y, t, min_periods=30, steps=100):\n", + " size = dataset.shape[0]\n", + " ordered_df = dataset.sort_values(prediction, ascending=False).reset_index(drop=True)\n", + " n_rows = list(range(min_periods, size, size // steps)) + [size]\n", + " \n", + " # just replacing a call to `elast` by a call to `elast_ci`\n", + " return np.array([elast_ci(ordered_df.head(rows), y, t) for rows in n_rows])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.7" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From ef19bd9ae8dfcdb20c7659aa27f4f6a5b26d5d0c Mon Sep 17 00:00:00 2001 From: Hector Lira Date: Tue, 21 Feb 2023 09:50:45 -0600 Subject: [PATCH 02/12] Modify prototype functions in notebook --- src/fklearn/causal/validation/ci_test.ipynb | 139 ++++++++++++-------- 1 file changed, 81 insertions(+), 58 deletions(-) diff --git a/src/fklearn/causal/validation/ci_test.ipynb b/src/fklearn/causal/validation/ci_test.ipynb index f7d0ff27..b8825c68 100644 --- a/src/fklearn/causal/validation/ci_test.ipynb +++ b/src/fklearn/causal/validation/ci_test.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 4, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -15,11 +15,11 @@ }, { "cell_type": "code", - 
"execution_count": 5, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ - "from typing import List\n", + "from typing import Any, List\n", "\n", "from toolz import curry\n", "from fklearn.types import EffectFnType" @@ -27,7 +27,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -43,7 +43,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -53,7 +53,7 @@ " 2.46153846, 2. ])" ] }, - "execution_count": 6, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -64,7 +64,7 @@ }, { "cell_type": "code", - "execution_count": 86, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -84,39 +84,21 @@ " beta0 = df[y].mean() - beta1 * t_bar\n", " e = df[y] - (beta0 + beta1*df[t])\n", " se = np.sqrt(((1/(n-2))*np.sum(e**2))/np.sum((df[t]-t_bar)**2))\n", - " print(f\"\"\"\n", - " n: {n}\n", - " t_bar: {t_bar}\n", - " beta1: {beta1}\n", - " beta0: {beta0}\n", - " \"\"\")\n", " return z*se" ] }, { "cell_type": "code", - "execution_count": 87, + "execution_count": 10, "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - " n: 3\n", - " t_bar: 2.0\n", - " beta1: 3.0\n", - " beta0: -2.0\n", - " \n" - ] - }, { "data": { "text/plain": [ "0.0" ] }, - "execution_count": 87, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -127,7 +109,7 @@ }, { "cell_type": "code", - "execution_count": 82, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -186,7 +168,7 @@ "8 3 3 7" ] }, - "execution_count": 82, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -198,14 +180,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "\n", + "def confidence_interval_curve(\n", + " df: pd.DataFrame,\n", + " treatment: str,\n", + " outcome: str,\n", + " prediction: str,\n", + " min_rows: int = 30,\n", + " steps: int = 100,\n", + " ci_fn: EffectFnType = linear_ci,\n", + " **kwargs,\n", + ") -> np.ndarray:\n", + " \n", + " size = df.shape[0]\n", + " ordered_df = df.sort_values(prediction, ascending=False).reset_index(drop=True)\n", + " n_rows = list(range(min_rows, size, size // steps)) + [size]\n", + " \n", + " return np.array([ci_fn(ordered_df.head(rows), treatment, outcome, **kwargs) for rows in n_rows])" + ] }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -218,7 +218,8 @@ " min_rows: int = 30,\n", " steps: int = 100,\n", " effect_fn: EffectFnType = linear_effect,\n", - " ci_fn = None,\n", + " ci_fn: EffectFnType = None,\n", + " **kwargs,\n", ") -> pd.DataFrame:\n", "\n", " size: int = df.shape[0]\n", @@ -246,11 +247,16 @@ " \n", " if ci_fn is not None:\n", " \n", - " # crear un wrapper de linear_ci parecido a cumulative_effect_curve (wrapper) y effect_fn (similitud con linear_ci)\n", - " # el wrapper debe de solo arrojar un arreglo (igual que cumulative_effect_curve)\n", - " ordered_df = df.sort_values(prediction, ascending=False).reset_index(drop=True)\n", - " print(ordered_df.head(n_rows[0]), n_rows[0])\n", - " effect_errors = np.array([ci_fn(ordered_df.head(rows), treatment, outcome) for rows in n_rows])\n", + " effect_errors: np.ndarray = confidence_interval_curve(\n", + " df=df,\n", + " treatment=treatment,\n", + " outcome=outcome,\n", + " prediction=prediction,\n", + " 
min_rows=min_rows,\n",
+ " steps=steps,\n",
+ " ci_fn=ci_fn,\n",
+ " **kwargs,\n",
+ " )\n",
 " \n",
 " effect_curves = effect_curves.assign(\n",
 " cumulative_effect_curve_error=effect_errors,\n",
@@ -262,7 +268,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 8,
+ "execution_count": 13,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -275,19 +281,9 @@
 },
 {
 "cell_type": "code",
- "execution_count": 81,
+ "execution_count": 24,
 "metadata": {},
 "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " t x y\n",
- "0 1 3 1\n",
- "1 2 3 4\n",
- "2 3 3 7 3\n"
- ]
- },
 {
 "data": {
 "text/html": [
@@ -439,7 +435,7 @@
 "6 0.956382 "
 ]
 },
- "execution_count": 81,
+ "execution_count": 24,
 "metadata": {},
 "output_type": "execute_result"
 }
@@ -454,6 +450,7 @@
 " steps = df.shape[0],\n",
 " effect_fn = linear_effect,\n",
 " ci_fn = linear_ci,\n",
+ " z = 1.96\n",
 ")"
 ]
 },
@@ -571,7 +568,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 51,
+ "execution_count": 16,
 "metadata": {},
 "outputs": [
 {
@@ -581,7 +578,7 @@
 " 0.76548284, 0.95638207])"
 ]
 },
- "execution_count": 51,
+ "execution_count": 16,
 "metadata": {},
 "output_type": "execute_result"
 }
@@ -599,16 +596,27 @@
 },
 {
 "cell_type": "code",
- "execution_count": null,
+ "execution_count": 17,
 "metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.35294117647058826"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
 "source": [
 "linear_effect(df, \"y\", \"t\")"
 ]
 },
 {
 "cell_type": "code",
- "execution_count": 50,
+ "execution_count": 14,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -632,7 +640,7 @@
 },
 {
 "cell_type": "code",
- "execution_count": 34,
+ "execution_count": 18,
 "metadata": {},
 "outputs": [],
 "source": [
@@ -650,7 +658,22 @@
 "execution_count": null,
 "metadata": {},
 "outputs": [],
- "source": []
+ "source": [
+ "# next tasks:\n",
+ "# - Add new arguments to the functions and function documentation\n",
+ "# - Modify cumulative effect curves with the new changes\n",
+ "# - Create a separate \"confidence intervals/errors\" file (effects.py) and put linear_ci there\n",
+ "# - Create a separate \"curves\" file (curves.py) and put confidence_interval_curve there\n",
+ "# - Create a new variable type (?, ErrorFnType) with the same signature as EffectFnType\n",
+ "# - Add tests with the examples from this notebook\n",
+ "# - Update the documentation index\n",
+ "# - Open PR\n",
+ "# - add comments:\n",
+ "# 1. discussion of the function types: they have the same signature, can we think of something more generic?\n",
+ "# 2. confidence_interval_curve and cumulative_effect_curve do the same thing, can we think of something more generic?\n",
+ "# 3. how to address the weakness that the curves and the errors are arrays that have to be the same size.\n",
+ "# Right now we are hard-coding it, but is there a better way to deal with this?"
+ ]
 }
 ],
 "metadata": {
 "kernelspec": {
 "display_name": "Python 3",
 "language": "python",
 "name": "python3"
 },
 "language_info": {
 "codemirror_mode": {
 "name": "ipython",
 "version": 3
 },
 "file_extension": ".py",
 "mimetype": "text/x-python",
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
 "version": "3.7.7"
 }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}

From c5adc625cfd2e2ca83c6a64bdf28a9c5c633862c Mon Sep 17 00:00:00 2001 From: Hector Lira Date: Tue, 11 Apr 2023 09:27:22 -0600 Subject: [PATCH 03/12] Modify effect_curves to add confidence intervals --- src/fklearn/causal/validation/ci_test.ipynb | 12 ++++----- src/fklearn/causal/validation/curves.py | 30 +++++++++++++++++++-- src/fklearn/types/types.py | 3 +++ 3 files changed, 37 insertions(+), 8 deletions(-)

diff --git a/src/fklearn/causal/validation/ci_test.ipynb b/src/fklearn/causal/validation/ci_test.ipynb
index b8825c68..60eff310 100644
--- a/src/fklearn/causal/validation/ci_test.ipynb
+++ b/src/fklearn/causal/validation/ci_test.ipynb
@@ -77,14 +77,14 @@
 "# se = np.sqrt(((1/(n-2))*np.sum(e**2))/np.sum((df[t]-t_bar)**2))\n",
 "# return np.array([beta1 - z*se, beta1 + z*se])\n",
 "\n",
- "def linear_ci(df, t, y, z=1.96):\n",
+ "def linear_ci(df, t, y):\n",
 " n = df.shape[0]\n",
 " t_bar = df[t].mean()\n",
 " beta1 = linear_effect(df, t, y)\n",
 " beta0 = df[y].mean() - beta1 * t_bar\n",
 " e = df[y] - (beta0 + beta1*df[t])\n",
 " se = np.sqrt(((1/(n-2))*np.sum(e**2))/np.sum((df[t]-t_bar)**2))\n",
- " return z*se"
+ " return se"
 ]
 },
@@ -192,7 +192,7 @@
 " prediction: str,\n",
 " min_rows: int = 30,\n",
 " steps: int = 100,\n",
- " ci_fn: EffectFnType = linear_ci,\n",
+ " error_fn: EffectFnType = linear_standard_error,\n",
 " **kwargs,\n",
 ") -> np.ndarray:\n",
 " \n",
@@ -200,7 +200,7 @@
 " ordered_df = df.sort_values(prediction, ascending=False).reset_index(drop=True)\n",
 " n_rows = list(range(min_rows, size, size // steps)) + [size]\n",
 " \n",
- " return np.array([ci_fn(ordered_df.head(rows), treatment, outcome, **kwargs) for rows in n_rows])"
+ " return np.array([error_fn(ordered_df.head(rows), treatment, outcome, **kwargs) for rows in n_rows])"
 ]
 },
@@ -218,7 +218,7 @@
 " min_rows: int = 30,\n",
 " steps: int = 100,\n",
 " effect_fn: EffectFnType = linear_effect,\n",
- " ci_fn: EffectFnType = None,\n",
+ " error_fn: EffectFnType = None,\n",
 " **kwargs,\n",
 ") -> pd.DataFrame:\n",
 "\n",
@@ -254,7 +254,7 @@
 " prediction=prediction,\n",
 " min_rows=min_rows,\n",
 " steps=steps,\n",
- " ci_fn=ci_fn,\n",
+ " error_fn=error_fn,\n",
 " **kwargs,\n",
 " )\n",
 " \n",
diff --git a/src/fklearn/causal/validation/curves.py b/src/fklearn/causal/validation/curves.py
index f3852479..eb1bd03f 100644
--- a/src/fklearn/causal/validation/curves.py
+++ b/src/fklearn/causal/validation/curves.py
@@ -4,7 +4,7 @@
 import pandas as pd
 from toolz import curry, partial
 
-from fklearn.types import EffectFnType
+from fklearn.types import EffectErrorFnType, EffectFnType
 from fklearn.causal.effects import linear_effect
 
 
@@ -215,6 +215,7 @@
     min_rows: int = 30,
     steps: int = 100,
     effect_fn: EffectFnType = linear_effect,
+    error_fn: EffectErrorFnType = None,
 ) -> pd.DataFrame:
     """
     Creates a dataset summarizing the effect curves: cumulative effect, cumulative gain and
@@ -247,6 +248,11 @@
         A function that computes the treatment effect given a dataframe, the name of the treatment column and
         the name of the outcome column.
 
+    error_fn : function (df: pandas.DataFrame, treatment: str, outcome: str) -> float or Array of float
+        A function that computes the standard error given a dataframe, the name of the treatment column and the name
+        of the outcome column. Standard error must be multiplied by a quantile to get the upper and lower bounds of
+        a confidence interval. 
+
 
     Returns
     ----------
@@ -268,7 +274,7 @@
     )
     ate: float = cum_effect[-1]
 
-    return pd.DataFrame({"samples_count": n_rows, "cumulative_effect_curve": cum_effect}).assign(
+    effect_curves_df = pd.DataFrame({"samples_count": n_rows, "cumulative_effect_curve": cum_effect}).assign(
         samples_fraction=lambda x: x["samples_count"] / size,
         cumulative_gain_curve=lambda x: x["samples_fraction"] * x["cumulative_effect_curve"],
         random_model_cumulative_gain_curve=lambda x: x["samples_fraction"] * ate,
@@ -276,3 +282,23 @@
             x["samples_fraction"] * x["cumulative_effect_curve"] - x["random_model_cumulative_gain_curve"]
         ),
     )
+
+    if error_fn is not None:
+
+        effect_errors: np.ndarray = confidence_interval_curve(
+            df=df,
+            treatment=treatment,
+            outcome=outcome,
+            prediction=prediction,
+            min_rows=min_rows,
+            steps=steps,
+            error_fn=error_fn,
+            **kwargs,
+        )
+
+        effect_curves_df = effect_curves_df.assign(
+            cumulative_effect_curve_error=effect_errors,
+            cumulative_gain_curve_error=lambda x: x["samples_fraction"] * x["cumulative_effect_curve_error"],
+        )
+
+    return effect_curves_df
\ No newline at end of file
diff --git a/src/fklearn/types/types.py b/src/fklearn/types/types.py
index 31e9e3d7..4f775a90 100644
--- a/src/fklearn/types/types.py
+++ b/src/fklearn/types/types.py
@@ -41,3 +41,6 @@
 
 # Effect Functions
 EffectFnType = Callable[[pd.DataFrame, str, str], float]
+
+# Effect Error Functions
+EffectErrorFnType = Callable[[pd.DataFrame, str, str], float]
\ No newline at end of file

From fe3033caa8851a5e7be7966d08fff37abe29119d Mon Sep 17 00:00:00 2001 From: MarianaBlaz Date: Tue, 18 Apr 2023 09:17:37 -0600 Subject: [PATCH 04/12] adds the linear standard error function in a new file for standard errors --- src/fklearn/causal/standard_errors.py | 36 +++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 src/fklearn/causal/standard_errors.py

diff --git a/src/fklearn/causal/standard_errors.py b/src/fklearn/causal/standard_errors.py
new file mode 100644
index 00000000..6e707839
--- /dev/null
+++ b/src/fklearn/causal/standard_errors.py
@@ -0,0 +1,36 @@
+import numpy as np
+import pandas as pd
+from fklearn.causal.effects import linear_effect
+
+def linear_standard_error(df: pd.DatFrame, treatment: str, outcome: str):
+    """
+    Linear Standard Error
+
+    Returns a Float: the linear standard error of a linear regression
+    of the outcome as a function of the treatment.
+
+    Parameters
+    ----------
+
+    df : Pandas DataFrame
+        A Pandas' DataFrame with treatment, outcome and confounder columns
+
+    treatment : str
+        The name of the column in `df` with the treatment.
+
+    outcome : str
+        The name of the column in `df` with the outcome.
+
+    Returns
+    ----------
+    se : Float
+        A Float of the linear standard error extracted by using the formula for
+        the simple linear regression. 
+ """ + n = df.shape[0] + t_bar = df[treatment].mean() + beta1 = linear_effect(df, treatment, outcome) + beta0 = df[outcome].mean() - beta1 * t_bar + e = df[outcome] - (beta0 + beta1*df[treatment]) + se = np.sqrt(((1/(n-2))*np.sum(e**2))/np.sum((df[treatment]-t_bar)**2)) + return se \ No newline at end of file From 906bdf4cc50bc50a910b5a0ba4578b590273373a Mon Sep 17 00:00:00 2001 From: MarianaBlaz Date: Tue, 18 Apr 2023 09:38:15 -0600 Subject: [PATCH 05/12] changed name of file from standard_error to statistical_errors --- src/fklearn/causal/{standard_errors.py => statistical_errors.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename src/fklearn/causal/{standard_errors.py => statistical_errors.py} (100%) diff --git a/src/fklearn/causal/standard_errors.py b/src/fklearn/causal/statistical_errors.py similarity index 100% rename from src/fklearn/causal/standard_errors.py rename to src/fklearn/causal/statistical_errors.py From 863ebb0734825cb2b5386f0cce331222be8130bf Mon Sep 17 00:00:00 2001 From: MarianaBlaz Date: Tue, 18 Apr 2023 09:38:49 -0600 Subject: [PATCH 06/12] includes definition of cumulative error and adds it into the effect curves --- src/fklearn/causal/validation/curves.py | 69 ++++++++++++++++++++++--- 1 file changed, 63 insertions(+), 6 deletions(-) diff --git a/src/fklearn/causal/validation/curves.py b/src/fklearn/causal/validation/curves.py index eb1bd03f..aa59f447 100644 --- a/src/fklearn/causal/validation/curves.py +++ b/src/fklearn/causal/validation/curves.py @@ -6,6 +6,7 @@ from fklearn.types import EffectErrorFnType, EffectFnType from fklearn.causal.effects import linear_effect +from fklearn.causal.statistical_errors import linear_standard_error @curry @@ -206,6 +207,58 @@ def relative_cumulative_gain_curve(df: pd.DataFrame, return np.array([(effect - ate) * (rows / size) for rows, effect in zip(n_rows, cum_effect)]) + +def cumulative_statistical_error_curve( + df: pd.DataFrame, + treatment: str, + outcome: str, + prediction: str, + min_rows: int = 30, + steps: int = 100, + error_fn: EffectFnType = linear_standard_error) -> np.ndarray: + + """ + Orders the dataset by prediction and computes the cumulative error curve according + to that ordering. The function to compute the error is given by error_fn. + + Parameters + ---------- + df : Pandas' DataFrame + A Pandas' DataFrame with target and prediction scores. + + treatment : Strings + The name of the treatment column in `df`. + + outcome : Strings + The name of the outcome column in `df`. + + prediction : Strings + The name of the prediction column in `df`. + + min_rows : Integer + Minimum number of observations needed to have a valid result. + + steps : Integer + The number of cumulative steps to iterate when accumulating the effect + + error_fn : function (df: pandas.DataFrame, treatment: str, outcome: str) -> float + A function that computes the statistical error of the regression of the treatment effect + over the outcome given a dataframe, the name of the treatment column and the name + of the outcome column. + + + Returns + ---------- + cumulative statistical error curve: Numpy's Array + The cumulative error according to the predictions ordering. 
+ """ + + size = df.shape[0] + ordered_df = df.sort_values(prediction, ascending=False).reset_index(drop=True) + n_rows = list(range(min_rows, size, size // steps)) + [size] + + return np.array([error_fn(ordered_df.head(rows), treatment, outcome) for rows in n_rows]) + @curry def effect_curves( df: pd.DataFrame, @@ -224,6 +277,11 @@ def effect_curves( Moreover one column indicating the cumulative gain for a corresponding random model is also included as a benchmark. + It is also possible to include a cumulative error function by passing an error_fn, this + column is useful to include a confidence interval, which can be achieved by multiplying the + error column by a desired quantile. + + Parameters ---------- df : Pandas' DataFrame @@ -248,9 +306,9 @@ def effect_curves( A function that computes the treatment effect given a dataframe, the name of the treatment column and the name of the outcome column. - error_fn : function (df: pandas.DataFrame, treatment: str, outcome: str) -> float or Array of float - A function that computes the standard error given a dataframe, the name of the treatment column and the name - of the outcome column. Standard error must be multiplied by a quantile to get the upper and lower bounds of + error_fn : function (df: pandas.DataFrame, treatment: str, outcome: str) -> float + A function that computes the statistical error given a dataframe, the name of the treatment column and the + name of the outcome column. The error must be multiplied by a quantile to get the upper and lower bounds of a confidence interval. @@ -285,15 +343,14 @@ def effect_curves( if error_fn is not None: - effect_errors: np.ndarray = confidence_interval_curve( + effect_errors: np.ndarray = cumulative_statistical_error_curve( df=df, treatment=treatment, outcome=outcome, prediction=prediction, min_rows=min_rows, steps=steps, - error_fn=error_fn, - **kwargs, + error_fn=error_fn ) effect_curves_df = effect_curves_df.assign( From d70c110c1eb22909215f15e6996907510bc6ee63 Mon Sep 17 00:00:00 2001 From: Hector Lira Date: Thu, 11 May 2023 09:09:37 -0600 Subject: [PATCH 07/12] Adds unit tests --- tests/causal/validation/test_curves.py | 5 ++++- .../validation/test_statistical_errors.py | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) create mode 100644 tests/causal/validation/test_statistical_errors.py diff --git a/tests/causal/validation/test_curves.py b/tests/causal/validation/test_curves.py index 98e6cc74..9fe40225 100644 --- a/tests/causal/validation/test_curves.py +++ b/tests/causal/validation/test_curves.py @@ -4,6 +4,7 @@ from fklearn.causal.effects import linear_effect from fklearn.causal.validation.curves import (effect_by_segment, cumulative_effect_curve, cumulative_gain_curve, relative_cumulative_gain_curve, effect_curves) +from fklearn.causal.validation.statistical_errors import linear_standard_error def test_effect_by_segment(): @@ -83,9 +84,11 @@ def test_effect_curves(): "cumulative_gain_curve": [1., 1.33333333, 1.62698413, 1.66666667, 1.94444444, 2.18803419, 2.], "random_model_cumulative_gain_curve": [0.6666666, 0.8888888, 1.1111111, 1.3333333, 1.5555555, 1.7777777, 2.], "relative_cumulative_gain_curve": [0.33333333, 0.44444444, 0.51587302, 0.33333333, 0.38888889, 0.41025641, 0.], + "cumulative_effect_curve_error": [0.0 , 0.0 , 0.30583887, 0.39528471, 0.32084447, 0.39055247, 0.48795004], + "cumulative_gain_curve_error": [0.0, 0.0, 0.16991048, 0.26352313, 0.24954570, 0.34715774, 0.48795003], }) result = effect_curves(df, prediction="x", outcome="y", 
treatment="t", min_rows=3, steps=df.shape[0], - effect_fn=linear_effect) + effect_fn=linear_effect, error_fn=linear_standard_error) pd.testing.assert_frame_equal(result, expected, atol=1e-07) diff --git a/tests/causal/validation/test_statistical_errors.py b/tests/causal/validation/test_statistical_errors.py new file mode 100644 index 00000000..3c4df073 --- /dev/null +++ b/tests/causal/validation/test_statistical_errors.py @@ -0,0 +1,18 @@ +import numpy as np +import pandas as pd + +from fklearn.causal.validation.statistical_errors import linear_standard_error + + +def test_linear_standard_error(): + + df = pd.DataFrame(dict( + t=[1, 1, 1, 2, 2, 2, 3, 3, 3], + x=[1, 2, 3, 1, 2, 3, 1, 2, 3], + y=[1, 1, 1, 2, 3, 4, 3, 5, 7], + )) + + result = linear_standard_error(df, treatment="t", outcome="y") + expected = 0.48795003647426655 + + np.testing.assert_array_almost_equal(result, expected, decimal=4) From c08fef8317d5d07087a69856b3422217b0d6bbfa Mon Sep 17 00:00:00 2001 From: Hector Lira Date: Thu, 11 May 2023 09:30:07 -0600 Subject: [PATCH 08/12] Fixes unit tests --- src/fklearn/causal/statistical_errors.py | 2 +- tests/causal/validation/test_curves.py | 2 +- tests/causal/validation/test_statistical_errors.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/fklearn/causal/statistical_errors.py b/src/fklearn/causal/statistical_errors.py index 6e707839..9e1726d9 100644 --- a/src/fklearn/causal/statistical_errors.py +++ b/src/fklearn/causal/statistical_errors.py @@ -2,7 +2,7 @@ import pandas as pd from fklearn.causal.effects import linear_effect -def linear_standard_error(df: pd.DatFrame, treatment: str, outcome: str): +def linear_standard_error(df: pd.DataFrame, treatment: str, outcome: str): """ Linear Standard Error diff --git a/tests/causal/validation/test_curves.py b/tests/causal/validation/test_curves.py index 9fe40225..1537739c 100644 --- a/tests/causal/validation/test_curves.py +++ b/tests/causal/validation/test_curves.py @@ -4,7 +4,7 @@ from fklearn.causal.effects import linear_effect from fklearn.causal.validation.curves import (effect_by_segment, cumulative_effect_curve, cumulative_gain_curve, relative_cumulative_gain_curve, effect_curves) -from fklearn.causal.validation.statistical_errors import linear_standard_error +from fklearn.causal.statistical_errors import linear_standard_error def test_effect_by_segment(): diff --git a/tests/causal/validation/test_statistical_errors.py b/tests/causal/validation/test_statistical_errors.py index 3c4df073..2429c5f4 100644 --- a/tests/causal/validation/test_statistical_errors.py +++ b/tests/causal/validation/test_statistical_errors.py @@ -1,7 +1,7 @@ import numpy as np import pandas as pd -from fklearn.causal.validation.statistical_errors import linear_standard_error +from fklearn.causal.statistical_errors import linear_standard_error def test_linear_standard_error(): From 553676218bd667fda1d270fb8154f063b864934f Mon Sep 17 00:00:00 2001 From: Hector Lira Date: Thu, 11 May 2023 09:46:14 -0600 Subject: [PATCH 09/12] Fixes code style --- docs/source/api/fklearn.causal.rst | 8 ++++++++ src/fklearn/causal/statistical_errors.py | 8 ++++---- src/fklearn/causal/validation/curves.py | 8 ++++---- tests/causal/validation/test_curves.py | 2 +- 4 files changed, 17 insertions(+), 9 deletions(-) diff --git a/docs/source/api/fklearn.causal.rst b/docs/source/api/fklearn.causal.rst index 88b6b7ab..9d67308b 100644 --- a/docs/source/api/fklearn.causal.rst +++ b/docs/source/api/fklearn.causal.rst @@ -27,6 +27,14 @@ fklearn.causal.effects module 
:undoc-members:
    :show-inheritance:
 
+fklearn.causal.statistical_errors module
+----------------------------------------
+
+.. automodule:: fklearn.causal.statistical_errors
+    :members:
+    :undoc-members:
+    :show-inheritance:
+
 Module contents
 ---------------
 
diff --git a/src/fklearn/causal/statistical_errors.py b/src/fklearn/causal/statistical_errors.py
index 9e1726d9..69d3a03b 100644
--- a/src/fklearn/causal/statistical_errors.py
+++ b/src/fklearn/causal/statistical_errors.py
@@ -2,7 +2,7 @@
 import pandas as pd
 from fklearn.causal.effects import linear_effect
 
-def linear_standard_error(df: pd.DataFrame, treatment: str, outcome: str):
+def linear_standard_error(df: pd.DataFrame, treatment: str, outcome: str) -> float:
     """
     Linear Standard Error
 
@@ -31,6 +31,6 @@
     t_bar = df[treatment].mean()
     beta1 = linear_effect(df, treatment, outcome)
     beta0 = df[outcome].mean() - beta1 * t_bar
-    e = df[outcome] - (beta0 + beta1*df[treatment])
-    se = np.sqrt(((1/(n-2))*np.sum(e**2))/np.sum((df[treatment]-t_bar)**2))
-    return se
\ No newline at end of file
+    e = df[outcome] - (beta0 + beta1 * df[treatment])
+    se = np.sqrt(((1 / (n - 2)) * np.sum(e**2)) / np.sum((df[treatment] - t_bar)**2))
+    return se
diff --git a/src/fklearn/causal/validation/curves.py b/src/fklearn/causal/validation/curves.py
index aa59f447..14ae42b4 100644
--- a/src/fklearn/causal/validation/curves.py
+++ b/src/fklearn/causal/validation/curves.py
@@ -207,7 +207,6 @@ def relative_cumulative_gain_curve(df: pd.DataFrame,
     return np.array([(effect - ate) * (rows / size) for rows, effect in zip(n_rows, cum_effect)])
 
-
 def cumulative_statistical_error_curve(
     df: pd.DataFrame,
     treatment: str,
@@ -256,9 +255,10 @@ def cumulative_statistical_error_curve(
     size = df.shape[0]
     ordered_df = df.sort_values(prediction, ascending=False).reset_index(drop=True)
     n_rows = list(range(min_rows, size, size // steps)) + [size]
-    
+
     return np.array([error_fn(ordered_df.head(rows), treatment, outcome) for rows in n_rows])
 
+
 @curry
 def effect_curves(
     df: pd.DataFrame,
@@ -269,7 +269,7 @@ def effect_curves(
     steps: int = 100,
     effect_fn: EffectFnType = linear_effect,
     error_fn: EffectErrorFnType = None,
-) -> pd.DataFrame:
+    ) -> pd.DataFrame:
     """
     Creates a dataset summarizing the effect curves: cumulative effect, cumulative gain and
     relative cumulative gain. 
The dataset also contains two columns referencing the data @@ -358,4 +358,4 @@ def effect_curves( cumulative_gain_curve_error=lambda x: x["samples_fraction"] * x["cumulative_effect_curve_error"], ) - return effect_curves_df \ No newline at end of file + return effect_curves_df diff --git a/tests/causal/validation/test_curves.py b/tests/causal/validation/test_curves.py index 1537739c..19ed373c 100644 --- a/tests/causal/validation/test_curves.py +++ b/tests/causal/validation/test_curves.py @@ -84,7 +84,7 @@ def test_effect_curves(): "cumulative_gain_curve": [1., 1.33333333, 1.62698413, 1.66666667, 1.94444444, 2.18803419, 2.], "random_model_cumulative_gain_curve": [0.6666666, 0.8888888, 1.1111111, 1.3333333, 1.5555555, 1.7777777, 2.], "relative_cumulative_gain_curve": [0.33333333, 0.44444444, 0.51587302, 0.33333333, 0.38888889, 0.41025641, 0.], - "cumulative_effect_curve_error": [0.0 , 0.0 , 0.30583887, 0.39528471, 0.32084447, 0.39055247, 0.48795004], + "cumulative_effect_curve_error": [0.0, 0.0, 0.30583887, 0.39528471, 0.32084447, 0.39055247, 0.48795004], "cumulative_gain_curve_error": [0.0, 0.0, 0.16991048, 0.26352313, 0.24954570, 0.34715774, 0.48795003], }) From 3db1f327cf4cab760ed7a6cbbcdef50414446774 Mon Sep 17 00:00:00 2001 From: Hector Lira Date: Thu, 11 May 2023 09:50:20 -0600 Subject: [PATCH 10/12] Fixes code style in statistical_errors.py --- src/fklearn/causal/statistical_errors.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/fklearn/causal/statistical_errors.py b/src/fklearn/causal/statistical_errors.py index 69d3a03b..863d07dc 100644 --- a/src/fklearn/causal/statistical_errors.py +++ b/src/fklearn/causal/statistical_errors.py @@ -2,11 +2,12 @@ import pandas as pd from fklearn.causal.effects import linear_effect + def linear_standard_error(df: pd.DataFrame, treatment: str, outcome: str) -> float: """ Linear Standard Error - Returns a Float: the linear standard error of a linear regression + Returns a Float: the linear standard error of a linear regression of the outcome as a function of the treatment. Parameters @@ -24,9 +25,10 @@ def linear_standard_error(df: pd.DataFrame, treatment: str, outcome: str) -> flo Returns ---------- se : Float - A Float of the linear standard error extracted by using the formula for + A Float of the linear standard error extracted by using the formula for the simple linear regression. 
""" + n = df.shape[0] t_bar = df[treatment].mean() beta1 = linear_effect(df, treatment, outcome) From d785591e0f4fae37db6f306037aab04118c33cf8 Mon Sep 17 00:00:00 2001 From: MarianaBlaz Date: Thu, 11 May 2023 09:54:07 -0600 Subject: [PATCH 11/12] corrects style in curves.py --- src/fklearn/causal/validation/curves.py | 36 ++++++++++++------------- src/fklearn/types/types.py | 2 +- 2 files changed, 18 insertions(+), 20 deletions(-) diff --git a/src/fklearn/causal/validation/curves.py b/src/fklearn/causal/validation/curves.py index 14ae42b4..f6a10f04 100644 --- a/src/fklearn/causal/validation/curves.py +++ b/src/fklearn/causal/validation/curves.py @@ -207,17 +207,17 @@ def relative_cumulative_gain_curve(df: pd.DataFrame, return np.array([(effect - ate) * (rows / size) for rows, effect in zip(n_rows, cum_effect)]) -def cumulative_statistical_error_curve( - df: pd.DataFrame, - treatment: str, - outcome: str, - prediction: str, - min_rows: int = 30, - steps: int = 100, - error_fn: EffectFnType = linear_standard_error) -> np.ndarray: +def cumulative_statistical_error_curve(df: pd.DataFrame, + treatment: str, + outcome: str, + prediction: str, + min_rows: int = 30, + steps: int = 100, + error_fn: EffectFnType = linear_standard_error, + ) -> np.ndarray: """ - Orders the dataset by prediction and computes the cumulative error curve according + Orders the dataset by prediction and computes the cumulative error curve according to that ordering. The function to compute the error is given by error_fn. Parameters @@ -260,16 +260,14 @@ def cumulative_statistical_error_curve( @curry -def effect_curves( - df: pd.DataFrame, - treatment: str, - outcome: str, - prediction: str, - min_rows: int = 30, - steps: int = 100, - effect_fn: EffectFnType = linear_effect, - error_fn: EffectErrorFnType = None, - ) -> pd.DataFrame: +def effect_curves(df: pd.DataFrame, + treatment: str, + outcome: str, + prediction: str, + min_rows: int = 30, + steps: int = 100, + effect_fn: EffectFnType = linear_effect, + error_fn: EffectErrorFnType = None) -> pd.DataFrame: """ Creates a dataset summarizing the effect curves: cumulative effect, cumulative gain and relative cumulative gain. 
The dataset also contains two columns referencing the data diff --git a/src/fklearn/types/types.py b/src/fklearn/types/types.py index 4f775a90..4e55d535 100644 --- a/src/fklearn/types/types.py +++ b/src/fklearn/types/types.py @@ -43,4 +43,4 @@ EffectFnType = Callable[[pd.DataFrame, str, str], float] # Effect Error Functions -EffectErrorFnType = Callable[[pd.DataFrame, str, str], float] \ No newline at end of file +EffectErrorFnType = Callable[[pd.DataFrame, str, str], float] From e5b5d4d880ba1a2e163afadb1dfc30bad4f7bec8 Mon Sep 17 00:00:00 2001 From: MarianaBlaz Date: Thu, 11 May 2023 10:53:42 -0600 Subject: [PATCH 12/12] removes notebook used for testing confidence interval curves --- src/fklearn/causal/validation/ci_test.ipynb | 700 -------------------- 1 file changed, 700 deletions(-) delete mode 100644 src/fklearn/causal/validation/ci_test.ipynb diff --git a/src/fklearn/causal/validation/ci_test.ipynb b/src/fklearn/causal/validation/ci_test.ipynb deleted file mode 100644 index 60eff310..00000000 --- a/src/fklearn/causal/validation/ci_test.ipynb +++ /dev/null @@ -1,700 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "from fklearn.causal.effects import linear_effect\n", - "from fklearn.causal.validation.curves import cumulative_effect_curve" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "from typing import Any, List\n", - "\n", - "from toolz import curry\n", - "from fklearn.types import EffectFnType" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.DataFrame(dict(\n", - " t=[1, 1, 1, 2, 2, 2, 3, 3, 3],\n", - " x=[1, 2, 3, 1, 2, 3, 1, 2, 3],\n", - " y=[1, 1, 1, 2, 3, 4, 3, 5, 7],\n", - "))\n", - "\n", - "result = cumulative_effect_curve(df, prediction=\"x\", outcome=\"y\", treatment=\"t\", min_rows=3, steps=df.shape[0],\n", - " effect_fn=linear_effect)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([3. , 3. , 2.92857143, 2.5 , 2.5 ,\n", - " 2.46153846, 2. 
])" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "result" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "# def linear_ci(df, y, t, z=1.96):\n", - "# n = df.shape[0]\n", - "# t_bar = df[t].mean()\n", - "# beta1 = linear_effect(df, y, t)\n", - "# beta0 = df[y].mean() - beta1 * t_bar\n", - "# e = df[y] - (beta0 + beta1*df[t])\n", - "# se = np.sqrt(((1/(n-2))*np.sum(e**2))/np.sum((df[t]-t_bar)**2))\n", - "# return np.array([beta1 - z*se, beta1 + z*se])\n", - "\n", - "def linear_ci(df, t, y):\n", - " n = df.shape[0]\n", - " t_bar = df[t].mean()\n", - " beta1 = linear_effect(df, t, y)\n", - " beta0 = df[y].mean() - beta1 * t_bar\n", - " e = df[y] - (beta0 + beta1*df[t])\n", - " se = np.sqrt(((1/(n-2))*np.sum(e**2))/np.sum((df[t]-t_bar)**2))\n", - " return se" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.0" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "linear_ci(ordered_df.head(3), \"t\", \"y\")" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
txy
2131
5234
8337
\n", - "
" - ], - "text/plain": [ - " t x y\n", - "2 1 3 1\n", - "5 2 3 4\n", - "8 3 3 7" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ordered_df = df.sort_values(by=\"x\", ascending=False)\n", - "ordered_df.head(3)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "def confidence_interval_curve(\n", - " df: pd.DataFrame,\n", - " treatment: str,\n", - " outcome: str,\n", - " prediction: str,\n", - " min_rows: int = 30,\n", - " steps: int = 100,\n", - " error_fn: EffectFnType = linear_standard_error,\n", - " **kwargs,\n", - ") -> np.ndarray:\n", - " \n", - " size = df.shape[0]\n", - " ordered_df = df.sort_values(prediction, ascending=False).reset_index(drop=True)\n", - " n_rows = list(range(min_rows, size, size // steps)) + [size]\n", - " \n", - " return np.array([error_fn(ordered_df.head(rows), treatment, outcome, **kwargs) for rows in n_rows])" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [], - "source": [ - "@curry\n", - "def effect_curves(\n", - " df: pd.DataFrame,\n", - " treatment: str,\n", - " outcome: str,\n", - " prediction: str,\n", - " min_rows: int = 30,\n", - " steps: int = 100,\n", - " effect_fn: EffectFnType = linear_effect,\n", - " error_fn: EffectFnType = None,\n", - " **kwargs,\n", - ") -> pd.DataFrame:\n", - "\n", - " size: int = df.shape[0]\n", - " n_rows: List[int] = list(range(min_rows, size, size // steps)) + [size]\n", - "\n", - " cum_effect: np.ndarray = cumulative_effect_curve(\n", - " df=df,\n", - " treatment=treatment,\n", - " outcome=outcome,\n", - " prediction=prediction,\n", - " min_rows=min_rows,\n", - " steps=steps,\n", - " effect_fn=effect_fn,\n", - " )\n", - " ate: float = cum_effect[-1]\n", - " \n", - " effect_curves = pd.DataFrame({\"samples_count\": n_rows, \"cumulative_effect_curve\": cum_effect}).assign(\n", - " samples_fraction=lambda x: x[\"samples_count\"] / size,\n", - " cumulative_gain_curve=lambda x: x[\"samples_fraction\"] * x[\"cumulative_effect_curve\"],\n", - " random_model_cumulative_gain_curve=lambda x: x[\"samples_fraction\"] * ate,\n", - " relative_cumulative_gain_curve=lambda x: (\n", - " x[\"samples_fraction\"] * x[\"cumulative_effect_curve\"] - x[\"random_model_cumulative_gain_curve\"]\n", - " ),\n", - " )\n", - " \n", - " if ci_fn is not None:\n", - " \n", - " effect_errors: np.ndarray = confidence_interval_curve(\n", - " df=df,\n", - " treatment=treatment,\n", - " outcome=outcome,\n", - " prediction=prediction,\n", - " min_rows=min_rows,\n", - " steps=steps,\n", - " error_fn=error_fn,\n", - " **kwargs,\n", - " )\n", - " \n", - " effect_curves = effect_curves.assign(\n", - " cumulative_effect_curve_error=effect_errors,\n", - " cumulative_gain_curve_error=lambda x: x[\"samples_fraction\"] * x[\"cumulative_effect_curve_error\"],\n", - " )\n", - "\n", - " return effect_curves\n" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.DataFrame(dict(\n", - " t=[1, 1, 1, 2, 2, 2, 3, 3, 3],\n", - " x=[1, 2, 3, 1, 2, 3, 1, 2, 3],\n", - " y=[1, 1, 1, 2, 3, 4, 3, 5, 7],\n", - "))" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
samples_countcumulative_effect_curvesamples_fractioncumulative_gain_curverandom_model_cumulative_gain_curverelative_cumulative_gain_curvecumulative_effect_curve_errorcumulative_gain_curve_error
033.0000000.3333331.0000000.6666670.3333330.0000000.000000
143.0000000.4444441.3333330.8888890.4444440.0000000.000000
252.9285710.5555561.6269841.1111110.5158730.5994440.333025
362.5000000.6666671.6666671.3333330.3333330.7747580.516505
472.5000000.7777781.9444441.5555560.3888890.6288550.489110
582.4615380.8888892.1880341.7777780.4102560.7654830.680429
692.0000001.0000002.0000002.0000000.0000000.9563820.956382
\n", - "
" - ], - "text/plain": [ - " samples_count cumulative_effect_curve samples_fraction \\\n", - "0 3 3.000000 0.333333 \n", - "1 4 3.000000 0.444444 \n", - "2 5 2.928571 0.555556 \n", - "3 6 2.500000 0.666667 \n", - "4 7 2.500000 0.777778 \n", - "5 8 2.461538 0.888889 \n", - "6 9 2.000000 1.000000 \n", - "\n", - " cumulative_gain_curve random_model_cumulative_gain_curve \\\n", - "0 1.000000 0.666667 \n", - "1 1.333333 0.888889 \n", - "2 1.626984 1.111111 \n", - "3 1.666667 1.333333 \n", - "4 1.944444 1.555556 \n", - "5 2.188034 1.777778 \n", - "6 2.000000 2.000000 \n", - "\n", - " relative_cumulative_gain_curve cumulative_effect_curve_error \\\n", - "0 0.333333 0.000000 \n", - "1 0.444444 0.000000 \n", - "2 0.515873 0.599444 \n", - "3 0.333333 0.774758 \n", - "4 0.388889 0.628855 \n", - "5 0.410256 0.765483 \n", - "6 0.000000 0.956382 \n", - "\n", - " cumulative_gain_curve_error \n", - "0 0.000000 \n", - "1 0.000000 \n", - "2 0.333025 \n", - "3 0.516505 \n", - "4 0.489110 \n", - "5 0.680429 \n", - "6 0.956382 " - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "effect_curves(\n", - " df=df,\n", - " treatment=\"t\",\n", - " outcome=\"y\",\n", - " prediction=\"x\",\n", - " min_rows = 3,\n", - " steps = df.shape[0],\n", - " effect_fn = linear_effect,\n", - " ci_fn = linear_ci,\n", - " z = 1.96\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 64, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
txy
0111
3212
6313
1121
4223
7325
2131
5234
8337
\n", - "
" - ], - "text/plain": [ - " t x y\n", - "0 1 1 1\n", - "3 2 1 2\n", - "6 3 1 3\n", - "1 1 2 1\n", - "4 2 2 3\n", - "7 3 2 5\n", - "2 1 3 1\n", - "5 2 3 4\n", - "8 3 3 7" - ] - }, - "execution_count": 64, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.sort_values(by=\"x\", ascending=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([0. , 0. , 0.59944419, 0.77475803, 0.62885517,\n", - " 0.76548284, 0.95638207])" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cumulative_elast_curve_ci(\n", - " dataset=df,\n", - " prediction=\"x\",\n", - " y=\"y\",\n", - " t=\"t\",\n", - " min_periods=3,\n", - " steps=df.shape[0]\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.35294117647058826" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "linear_effect(df, \"y\", \"t\")" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "def elast_ci(df, y, t, z=1.96):\n", - " n = df.shape[0]\n", - " t_bar = df[t].mean()\n", - "# beta1 = elast(df, y, t)\n", - " beta1 = linear_effect(df, t, y)\n", - " beta0 = df[y].mean() - beta1 * t_bar\n", - " e = df[y] - (beta0 + beta1*df[t])\n", - " se = np.sqrt(((1/(n-2))*np.sum(e**2))/np.sum((df[t]-t_bar)**2))\n", - "# return np.array([beta1 - z*se, beta1 + z*se])\n", - " return z*se\n", - "\n", - "@curry\n", - "def elast(data, y, t):\n", - " # line coeficient for the one variable linear regression\n", - " return (np.sum((data[t] - data[t].mean())*(data[y] - data[y].mean())) /\n", - " np.sum((data[t] - data[t].mean())**2))" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], - "source": [ - "def cumulative_elast_curve_ci(dataset, prediction, y, t, min_periods=30, steps=100):\n", - " size = dataset.shape[0]\n", - " ordered_df = dataset.sort_values(prediction, ascending=False).reset_index(drop=True)\n", - " n_rows = list(range(min_periods, size, size // steps)) + [size]\n", - " \n", - " # just replacing a call to `elast` by a call to `elast_ci`\n", - " return np.array([elast_ci(ordered_df.head(rows), y, t) for rows in n_rows])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# siguientes tasks:\n", - "# - Agregar nuevos argumentos a las funciones y documentación de las funciones\n", - "# - Modificar cumulative effect curves con nuevos cambios\n", - "# - Crear archivo aparte de \"confidence intervals/errors\" (effects.py) y ahí poner linear_ci\n", - "# - Crear archivo aparte de \"curves\" (curves.py) y ahí poner confidence_interval_curve\n", - "# - Crear un nuevo tipo de variable (?, ErrorFnType) con el mismo signature de EffectFnType\n", - "# - Agregar tests con los ejemplos de este notebook\n", - "# - Modificar índice de la documentación\n", - "# - Abrir PR\n", - "# - agregar comentarios:\n", - "# 1. discusión de los tipos de las funciones: tienen el mismo signature, podemos pensar en algo más genérico?\n", - "# 2. confidence_interval_curve y cumulative_effect_curve hacen lo mismo, podemos pensar en algo más genérico?\n", - "# 3. 
cómo atacar la vulnerabilidad de que las curvas y los errores son arreglos que tienen que ser del mismo tamaño.\n", - "# Ahorita lo estamos hard-coding pero existirá alguna mejor manera de lidiar con esto?" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.7" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -}
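
A note on the statistic this series introduces. linear_standard_error fits the simple regression of the outcome on the treatment and returns the standard error of the fitted slope. In the notation of the committed code (slope beta1, intercept beta0, residuals e, treatment mean t_bar), the returned quantity is, written in LaTeX:

e_i = y_i - (\hat{\beta}_0 + \hat{\beta}_1 t_i), \qquad
\operatorname{se}(\hat{\beta}_1) = \sqrt{\frac{\frac{1}{n-2} \sum_{i=1}^{n} e_i^2}{\sum_{i=1}^{n} (t_i - \bar{t})^2}}

which is exactly the numpy expression np.sqrt(((1 / (n - 2)) * np.sum(e**2)) / np.sum((df[treatment] - t_bar)**2)) in statistical_errors.py. The n - 2 divisor reflects the two estimated parameters, intercept and slope.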
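
To sanity-check that formula independently of fklearn, here is a minimal, self-contained sketch using only numpy and pandas. The helper name ols_slope_standard_error is illustrative and not part of the library; the data frame is the toy example used throughout the series, and the printed value matches the 0.48795003647426655 asserted in test_statistical_errors.py.

import numpy as np
import pandas as pd


def ols_slope_standard_error(df: pd.DataFrame, treatment: str, outcome: str) -> float:
    # Illustrative re-implementation of the series' linear_standard_error:
    # fit outcome = beta0 + beta1 * treatment by OLS, then compute the
    # standard error of the slope from the residuals.
    n = df.shape[0]
    t_bar = df[treatment].mean()
    # OLS slope (the same algebra as fklearn's linear_effect)
    beta1 = (np.sum((df[treatment] - t_bar) * (df[outcome] - df[outcome].mean()))
             / np.sum((df[treatment] - t_bar) ** 2))
    beta0 = df[outcome].mean() - beta1 * t_bar
    e = df[outcome] - (beta0 + beta1 * df[treatment])  # regression residuals
    # n - 2 degrees of freedom: two parameters (intercept and slope) were fit
    return float(np.sqrt(((1 / (n - 2)) * np.sum(e ** 2)) / np.sum((df[treatment] - t_bar) ** 2)))


df = pd.DataFrame(dict(
    t=[1, 1, 1, 2, 2, 2, 3, 3, 3],
    x=[1, 2, 3, 1, 2, 3, 1, 2, 3],
    y=[1, 1, 1, 2, 3, 4, 3, 5, 7],
))

print(ols_slope_standard_error(df, treatment="t", outcome="y"))
# 0.48795003647426655, the value asserted in test_statistical_errors.py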
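
Finally, a sketch of how the new error columns are meant to be consumed, following the effect_curves docstring ("The error must be multiplied by a quantile to get the upper and lower bounds of a confidence interval"). This assumes a build of fklearn that includes the patches above; the 1.96 quantile mirrors the z = 1.96 used in the scratch notebook and corresponds to an approximate 95% normal interval.

import pandas as pd

from fklearn.causal.effects import linear_effect
from fklearn.causal.statistical_errors import linear_standard_error
from fklearn.causal.validation.curves import effect_curves

df = pd.DataFrame(dict(
    t=[1, 1, 1, 2, 2, 2, 3, 3, 3],
    x=[1, 2, 3, 1, 2, 3, 1, 2, 3],
    y=[1, 1, 1, 2, 3, 4, 3, 5, 7],
))

curves = effect_curves(df, treatment="t", outcome="y", prediction="x",
                       min_rows=3, steps=df.shape[0],
                       effect_fn=linear_effect, error_fn=linear_standard_error)

z = 1.96  # approximate 95% normal quantile, as in the scratch notebook
band = curves.assign(
    upper=lambda d: d["cumulative_effect_curve"] + z * d["cumulative_effect_curve_error"],
    lower=lambda d: d["cumulative_effect_curve"] - z * d["cumulative_effect_curve_error"],
)
print(band[["samples_count", "lower", "cumulative_effect_curve", "upper"]])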