Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add confidence interval causal curves #231

Open
wants to merge 12 commits into
base: master
Choose a base branch
from
8 changes: 8 additions & 0 deletions docs/source/api/fklearn.causal.rst
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,14 @@ fklearn.causal.effects module
:undoc-members:
:show-inheritance:

fklearn.causal.statistical_errors module
----------------------------------------

.. automodule:: fklearn.causal.statistical_errors
:members:
:undoc-members:
:show-inheritance:


Module contents
---------------
Expand Down
38 changes: 38 additions & 0 deletions src/fklearn/causal/statistical_errors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import numpy as np
import pandas as pd
from fklearn.causal.effects import linear_effect


def linear_standard_error(df: pd.DataFrame, treatment: str, outcome: str) -> float:
    """
    Linear Standard Error

    Computes the standard error of the slope coefficient of a simple linear
    regression of the outcome on the treatment, using the closed-form OLS
    formula with n - 2 degrees of freedom.

    Parameters
    ----------

    df : Pandas DataFrame
        A Pandas' DataFrame with treatment and outcome columns. Any other
        columns are ignored: only a simple (single-regressor) regression of
        the outcome on the treatment is fitted here.

    treatment : str
        The name of the column in `df` with the treatment.

    outcome : str
        The name of the column in `df` with the outcome.

    Returns
    ----------
    se : Float
        The standard error of the slope, extracted by using the formula for
        the simple linear regression. `df` must have more than 2 rows,
        and the treatment must not be constant.
    """

    n = df.shape[0]
    t_bar = df[treatment].mean()
    # Slope comes from the same estimator used for the effect curves, so the
    # error is consistent with the reported effect. The intercept is recovered
    # from the OLS identity beta0 = y_bar - beta1 * t_bar.
    beta1 = linear_effect(df, treatment, outcome)
    beta0 = df[outcome].mean() - beta1 * t_bar
    # Residuals of the fitted line.
    e = df[outcome] - (beta0 + beta1 * df[treatment])
    # SE(beta1) = sqrt( (SSR / (n - 2)) / Sxx )
    se = np.sqrt(((1 / (n - 2)) * np.sum(e**2)) / np.sum((df[treatment] - t_bar)**2))
    return se
103 changes: 92 additions & 11 deletions src/fklearn/causal/validation/curves.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@
import pandas as pd
from toolz import curry, partial

from fklearn.types import EffectFnType
from fklearn.types import EffectErrorFnType, EffectFnType
from fklearn.causal.effects import linear_effect
from fklearn.causal.statistical_errors import linear_standard_error


@curry
Expand Down Expand Up @@ -206,23 +207,79 @@ def relative_cumulative_gain_curve(df: pd.DataFrame,
return np.array([(effect - ate) * (rows / size) for rows, effect in zip(n_rows, cum_effect)])


@curry
def cumulative_statistical_error_curve(df: pd.DataFrame,
                                       treatment: str,
                                       outcome: str,
                                       prediction: str,
                                       min_rows: int = 30,
                                       steps: int = 100,
                                       error_fn: EffectErrorFnType = linear_standard_error,
                                       ) -> np.ndarray:

    """
    Orders the dataset by prediction and computes the cumulative error curve according
    to that ordering. The function to compute the error is given by error_fn.

    Parameters
    ----------
    df : Pandas' DataFrame
        A Pandas' DataFrame with target and prediction scores.

    treatment : str
        The name of the treatment column in `df`.

    outcome : str
        The name of the outcome column in `df`.

    prediction : str
        The name of the prediction column in `df`.

    min_rows : Integer
        Minimum number of observations needed to have a valid result.

    steps : Integer
        The number of cumulative steps to iterate when accumulating the effect

    error_fn : function (df: pandas.DataFrame, treatment: str, outcome: str) -> float
        A function that computes the statistical error of the regression of the treatment effect
        over the outcome given a dataframe, the name of the treatment column and the name
        of the outcome column.


    Returns
    ----------
    cumulative statistical error curve: Numpy's Array
        The cumulative error according to the predictions ordering.
    """

    size = df.shape[0]
    # Highest predicted effect first, so each prefix is "the top k% by prediction".
    ordered_df = df.sort_values(prediction, ascending=False).reset_index(drop=True)
    # Same step grid as the other curve functions in this module; the final
    # point always uses the whole dataset.
    n_rows = list(range(min_rows, size, size // steps)) + [size]

    return np.array([error_fn(ordered_df.head(rows), treatment, outcome) for rows in n_rows])


@curry
def effect_curves(
df: pd.DataFrame,
treatment: str,
outcome: str,
prediction: str,
min_rows: int = 30,
steps: int = 100,
effect_fn: EffectFnType = linear_effect,
) -> pd.DataFrame:
def effect_curves(df: pd.DataFrame,
treatment: str,
outcome: str,
prediction: str,
min_rows: int = 30,
steps: int = 100,
effect_fn: EffectFnType = linear_effect,
error_fn: EffectErrorFnType = None) -> pd.DataFrame:
"""
Creates a dataset summarizing the effect curves: cumulative effect, cumulative gain and
relative cumulative gain. The dataset also contains two columns referencing the data
used to compute the curves at each step: number of samples and fraction of samples used.
Moreover one column indicating the cumulative gain for a corresponding random model is
also included as a benchmark.

It is also possible to include a cumulative error function by passing an error_fn, this
column is useful to include a confidence interval, which can be achieved by multiplying the
error column by a desired quantile.


Parameters
----------
df : Pandas' DataFrame
Expand All @@ -247,6 +304,11 @@ def effect_curves(
A function that computes the treatment effect given a dataframe, the name of the treatment column and the name
of the outcome column.

error_fn : function (df: pandas.DataFrame, treatment: str, outcome: str) -> float
A function that computes the statistical error given a dataframe, the name of the treatment column and the
name of the outcome column. The error must be multiplied by a quantile to get the upper and lower bounds of
a confidence interval.


Returns
----------
Expand All @@ -268,11 +330,30 @@ def effect_curves(
)
ate: float = cum_effect[-1]

return pd.DataFrame({"samples_count": n_rows, "cumulative_effect_curve": cum_effect}).assign(
effect_curves_df = pd.DataFrame({"samples_count": n_rows, "cumulative_effect_curve": cum_effect}).assign(
samples_fraction=lambda x: x["samples_count"] / size,
cumulative_gain_curve=lambda x: x["samples_fraction"] * x["cumulative_effect_curve"],
random_model_cumulative_gain_curve=lambda x: x["samples_fraction"] * ate,
relative_cumulative_gain_curve=lambda x: (
x["samples_fraction"] * x["cumulative_effect_curve"] - x["random_model_cumulative_gain_curve"]
),
)

if error_fn is not None:

effect_errors: np.ndarray = cumulative_statistical_error_curve(
df=df,
treatment=treatment,
outcome=outcome,
prediction=prediction,
min_rows=min_rows,
steps=steps,
error_fn=error_fn
)

effect_curves_df = effect_curves_df.assign(
cumulative_effect_curve_error=effect_errors,
cumulative_gain_curve_error=lambda x: x["samples_fraction"] * x["cumulative_effect_curve_error"],
)

return effect_curves_df
3 changes: 3 additions & 0 deletions src/fklearn/types/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,3 +41,6 @@

# Effect Functions: estimate a treatment effect from a DataFrame plus
# treatment and outcome column names.
EffectFnType = Callable[[pd.DataFrame, str, str], float]

# Effect Error Functions: compute a statistical error (e.g. a standard error)
# of an effect estimate from a DataFrame plus treatment and outcome column names.
EffectErrorFnType = Callable[[pd.DataFrame, str, str], float]
5 changes: 4 additions & 1 deletion tests/causal/validation/test_curves.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from fklearn.causal.effects import linear_effect
from fklearn.causal.validation.curves import (effect_by_segment, cumulative_effect_curve, cumulative_gain_curve,
relative_cumulative_gain_curve, effect_curves)
from fklearn.causal.statistical_errors import linear_standard_error


def test_effect_by_segment():
Expand Down Expand Up @@ -83,9 +84,11 @@ def test_effect_curves():
"cumulative_gain_curve": [1., 1.33333333, 1.62698413, 1.66666667, 1.94444444, 2.18803419, 2.],
"random_model_cumulative_gain_curve": [0.6666666, 0.8888888, 1.1111111, 1.3333333, 1.5555555, 1.7777777, 2.],
"relative_cumulative_gain_curve": [0.33333333, 0.44444444, 0.51587302, 0.33333333, 0.38888889, 0.41025641, 0.],
"cumulative_effect_curve_error": [0.0, 0.0, 0.30583887, 0.39528471, 0.32084447, 0.39055247, 0.48795004],
"cumulative_gain_curve_error": [0.0, 0.0, 0.16991048, 0.26352313, 0.24954570, 0.34715774, 0.48795003],
})

result = effect_curves(df, prediction="x", outcome="y", treatment="t", min_rows=3, steps=df.shape[0],
effect_fn=linear_effect)
effect_fn=linear_effect, error_fn=linear_standard_error)

pd.testing.assert_frame_equal(result, expected, atol=1e-07)
18 changes: 18 additions & 0 deletions tests/causal/validation/test_statistical_errors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
import numpy as np
import pandas as pd

from fklearn.causal.statistical_errors import linear_standard_error


def test_linear_standard_error():

    # Nine observations; "x" is a noise column the estimator must ignore.
    data = pd.DataFrame({
        "t": [1, 1, 1, 2, 2, 2, 3, 3, 3],
        "x": [1, 2, 3, 1, 2, 3, 1, 2, 3],
        "y": [1, 1, 1, 2, 3, 4, 3, 5, 7],
    })

    expected = 0.48795003647426655
    result = linear_standard_error(data, treatment="t", outcome="y")

    np.testing.assert_array_almost_equal(result, expected, decimal=4)