Skip to content

Commit

Permalink
Merge pull request #11 from jzluo/10-wald-test-and-confidence-intervals
Browse files Browse the repository at this point in the history
10 wald test and confidence intervals
  • Loading branch information
jzluo authored Aug 2, 2022
2 parents 89bcce4 + b68efd7 commit f27c2d0
Show file tree
Hide file tree
Showing 4 changed files with 83 additions and 41 deletions.
10 changes: 9 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,19 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]

## [0.4.0] - 2022-08-01

### Added
- Option to use Wald method for computing p-values and confidence intervals instead of LRT and profile likelihood. Set `wald=True` to use ([#11](https://github.com/jzluo/firthlogist/pull/11)).
- Tests for `load_sex2()` and `load_endometrial()` ([#9](https://github.com/jzluo/firthlogist/pull/9)).
- Test for profile likelihood confidence intervals ([#9](https://github.com/jzluo/firthlogist/pull/9)).
### Changed
- `skip_lrt` option is now `skip_pvals` ([#11](https://github.com/jzluo/firthlogist/pull/11)).
### Fixed
- `.summary()` no longer breaks if skipping confidence interval or p-value calculation ([#11](https://github.com/jzluo/firthlogist/pull/11)).
### Removed
- Diabetes and sex2 csv files removed from testing dir ([#9](([#9](https://github.com/jzluo/firthlogist/pull/9))).
- Diabetes and sex2 csv files removed from testing dir ([#9](https://github.com/jzluo/firthlogist/pull/9)).

## [0.3.1] - 2022-07-29
### Added
Expand Down
11 changes: 9 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,16 +70,23 @@ be less than max_stepsize.

 Specifies if intercept should be added.

`skip_lrt`: **_bool_, default=False**
`skip_pvals`: **_bool_, default=False**

 If True, p-values will not be calculated. Calculating the p-values can
be expensive since the fitting procedure is repeated for each
be expensive if `wald=False` since the fitting procedure is repeated for each
coefficient.

`skip_ci`: **_bool_, default=False**

 If True, confidence intervals will not be calculated. Calculating the confidence intervals via profile likelihoood is time-consuming.

`alpha`: **_float_, default=0.05**

 Significance level (confidence interval = 1-alpha). 0.05 as default for 95% CI.

`wald`: **_bool_, default=False**

 If True, uses Wald method to calculate p-values and confidence intervals.

### Attributes
`bse_`
Expand Down
101 changes: 64 additions & 37 deletions firthlogist/firthlogist.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import numpy as np
from scipy.linalg import lapack
from scipy.special import expit
from scipy.stats import chi2
from scipy.stats import chi2, norm
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.exceptions import ConvergenceWarning
from sklearn.preprocessing import LabelEncoder
Expand Down Expand Up @@ -43,14 +43,17 @@ class FirthLogisticRegression(BaseEstimator, ClassifierMixin):
Convergence tolerance for stopping.
fit_intercept
Specifies if intercept should be added.
skip_lrt
skip_pvals
If True, p-values will not be calculated. Calculating the p-values can be
time-consuming since the fitting procedure is repeated for each coefficient.
time-consuming if `wald=False` since the fitting procedure is repeated for each
coefficient.
skip_ci
If True, confidence intervals will not be calculated. Calculating the confidence
intervals via profile likelihoood is time-consuming.
alpha
Significance level (confidence interval = 1-alpha). 0.05 as default for 95% CI.
wald
If True, uses Wald method to calculate p-values and confidence intervals.
Attributes
----------
Expand Down Expand Up @@ -90,9 +93,10 @@ def __init__(
pl_max_stepsize=5,
tol=0.0001,
fit_intercept=True,
skip_lrt=False,
skip_pvals=False,
skip_ci=False,
alpha=0.05,
wald=False,
):
self.max_iter = max_iter
self.max_stepsize = max_stepsize
Expand All @@ -102,9 +106,10 @@ def __init__(
self.pl_max_stepsize = pl_max_stepsize
self.tol = tol
self.fit_intercept = fit_intercept
self.skip_lrt = skip_lrt
self.skip_pvals = skip_pvals
self.skip_ci = skip_ci
self.alpha = alpha
self.wald = wald

def _more_tags(self):
return {"binary_only": True}
Expand Down Expand Up @@ -146,40 +151,51 @@ def fit(self, X, y):
self.bse_ = _bse(X, self.coef_)

if not self.skip_ci:
self.ci_ = np.column_stack(
[
_profile_likelihood_ci(
X=X,
y=y,
side=side,
fitted_coef=self.coef_,
full_loglik=self.loglik_,
max_iter=self.pl_max_iter,
max_stepsize=self.pl_max_stepsize,
max_halfstep=self.pl_max_halfstep,
tol=self.tol,
alpha=0.05,
)
for side in [-1, 1]
]
)
if not self.wald:
self.ci_ = np.column_stack(
[
_profile_likelihood_ci(
X=X,
y=y,
side=side,
fitted_coef=self.coef_,
full_loglik=self.loglik_,
max_iter=self.pl_max_iter,
max_stepsize=self.pl_max_stepsize,
max_halfstep=self.pl_max_halfstep,
tol=self.tol,
alpha=0.05,
)
for side in [-1, 1]
]
)
else:
self.ci_ = _wald_ci(self.coef_, self.bse_, self.alpha)
else:
self.ci_ = np.full((self.coef_.shape[0], 2), np.nan)

# penalized likelihood ratio tests
if not self.skip_lrt:
pvals = []
# mask is 1-indexed because of `if mask` check in _get_XW()
for mask in range(1, self.coef_.shape[0] + 1):
_, null_loglik, _ = _firth_newton_raphson(
X,
y,
self.max_iter,
self.max_stepsize,
self.max_halfstep,
self.tol,
mask,
)
pvals.append(_lrt(self.loglik_, null_loglik))
self.pvals_ = np.array(pvals)
if not self.skip_pvals:
if not self.wald:
pvals = []
# mask is 1-indexed because of `if mask` check in _get_XW()
for mask in range(1, self.coef_.shape[0] + 1):
_, null_loglik, _ = _firth_newton_raphson(
X,
y,
self.max_iter,
self.max_stepsize,
self.max_halfstep,
self.tol,
mask,
)
pvals.append(_lrt(self.loglik_, null_loglik))
self.pvals_ = np.array(pvals)

else:
self.pvals_ = _wald_test(self.coef_, self.bse_)
else:
self.pvals_ = np.full(self.coef_.shape[0], np.nan)

if self.fit_intercept:
self.intercept_ = self.coef_[-1]
Expand Down Expand Up @@ -421,6 +437,17 @@ def _profile_likelihood_ci(
return ci


def _wald_ci(coef, bse, alpha):
lower_ci = coef + norm.ppf(alpha / 2) * bse
upper_ci = coef + norm.ppf(1 - alpha / 2) * bse
return np.column_stack([lower_ci, upper_ci])


def _wald_test(coef, bse):
# 1 - pchisq((beta^2/vars), 1), in our case bse = vars^0.5
return chi2.sf(np.square(coef) / np.square(bse), 1)


def load_sex2():
"""
Load the sex2 dataset from `logistf`.
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "firthlogist"
version = "0.3.1"
version = "0.4.0"
description = "Python implementation of Logistic Regression with Firth's bias reduction"
authors = ["Jon Luo <[email protected]>"]
repository = "https://github.com/jzluo/firthlogist"
Expand Down

0 comments on commit f27c2d0

Please sign in to comment.