From e884a19ba3a42c0267ce4c6f56bb37bdfda07555 Mon Sep 17 00:00:00 2001 From: Jon Luo <20971593+jzluo@users.noreply.github.com> Date: Fri, 29 Jul 2022 17:36:32 -0400 Subject: [PATCH 1/3] add endometrial cancer dataset --- .flake8 | 1 + firthlogist/__init__.py | 6 ++- firthlogist/datasets/endometrial.csv | 80 ++++++++++++++++++++++++++++ firthlogist/firthlogist.py | 31 +++++++++++ 4 files changed, 117 insertions(+), 1 deletion(-) create mode 100644 firthlogist/datasets/endometrial.csv diff --git a/.flake8 b/.flake8 index 8dd399a..a3a2593 100644 --- a/.flake8 +++ b/.flake8 @@ -1,3 +1,4 @@ [flake8] max-line-length = 88 extend-ignore = E203 +per-file-ignores = firthlogist/__init__.py:F401 diff --git a/firthlogist/__init__.py b/firthlogist/__init__.py index c28fe0c..e7fed8e 100644 --- a/firthlogist/__init__.py +++ b/firthlogist/__init__.py @@ -1 +1,5 @@ -from .firthlogist import FirthLogisticRegression, load_sex2 # noqa F401 +from .firthlogist import ( # noqa F401 + FirthLogisticRegression, + load_endometrial, + load_sex2, +) diff --git a/firthlogist/datasets/endometrial.csv b/firthlogist/datasets/endometrial.csv new file mode 100644 index 0000000..18c5b9e --- /dev/null +++ b/firthlogist/datasets/endometrial.csv @@ -0,0 +1,80 @@ +"NV","PI","EH","HG" +0,13,1.64,0 +0,16,2.26,0 +0,8,3.14,0 +0,34,2.68,0 +0,20,1.28,0 +0,5,2.31,0 +0,17,1.8,0 +0,10,1.68,0 +0,26,1.56,0 +0,17,2.31,0 +0,8,2.01,0 +0,7,1.89,0 +0,20,3.15,0 +0,10,1.23,0 +0,18,1.27,0 +0,16,1.76,0 +0,18,2,0 +0,8,2.64,1 +0,29,0.88,1 +0,12,1.27,1 +0,20,1.37,1 +1,38,0.97,1 +1,22,1.14,1 +1,7,0.88,1 +1,25,0.91,1 +1,15,0.58,1 +0,7,0.97,1 +0,28,1.5,0 +0,11,1.33,0 +0,19,2.37,0 +0,10,1.82,0 +0,10,3.13,0 +0,18,1.31,0 +0,14,1.92,0 +0,21,1.64,0 +0,11,2.01,0 +0,17,1.88,0 +0,25,1.93,0 +0,16,2.11,0 +0,19,1.29,0 +0,15,1.72,0 +0,33,0.75,0 +0,24,1.92,0 +0,48,1.84,1 +0,12,1.11,1 +0,19,1.61,1 +0,2,1.18,1 +1,22,1.44,1 +1,40,1.18,1 +1,5,0.93,1 +1,0,1.17,1 +0,21,1.19,1 +0,15,1.06,1 +0,29,2.02,0 +0,15,2.29,0 +0,12,2.33,0 +0,3,2.9,0 +0,20,1.7,0 
+0,23,1.41,0 +0,12,2.25,0 +0,22,1.54,0 +0,42,1.97,0 +0,15,1.75,0 +0,13,2.16,0 +0,14,2.57,0 +0,19,1.37,0 +0,12,3.61,0 +0,13,2.04,0 +0,10,2.17,0 +0,12,1.69,1 +1,49,0.27,1 +0,6,1.84,1 +0,5,1.3,1 +0,17,0.96,1 +1,11,1.01,1 +1,21,0.98,1 +0,5,0.35,1 +1,19,1.02,1 +0,33,0.85,1 diff --git a/firthlogist/firthlogist.py b/firthlogist/firthlogist.py index beb0c7a..1de5968 100644 --- a/firthlogist/firthlogist.py +++ b/firthlogist/firthlogist.py @@ -444,3 +444,34 @@ def load_sex2(): X = X[:, 1:] feature_names = ["age", "oc", "vic", "vicl", "vis", "dia"] return X, y, feature_names + + +def load_endometrial(): + """ + Load the endometrial cancer dataset analyzed in Heinze and Schemper (2002). The data + was originally provided by Dr E. Asseryanis from the Vienna University Medical + School. + + Returns + ------- + X + endometrial feature columns (`NV`, `PI`, `EH`) as numpy array + y + endometrial `HG` target column + feature_names + List of feature names + + References + ---------- + Agresti, A (2015). Foundations of Linear and Generalized Linear Models. + Wiley Series in Probability and Statistics. + + Heinze G, Schemper M (2002). A solution to the problem of separation in logistic + regression. Statistics in Medicine 21: 2409-2419. 
+ """ + with open_text("firthlogist.datasets", "endometrial.csv") as sex2: + X = np.loadtxt(sex2, skiprows=1, delimiter=",") + y = X[:, -1] + X = X[:, :-1] + feature_names = ["NV", "PI", "EH"] + return X, y, feature_names From 51324110207b2493646b0bdecd936fd77342faae Mon Sep 17 00:00:00 2001 From: Jon Luo <20971593+jzluo@users.noreply.github.com> Date: Fri, 29 Jul 2022 18:04:11 -0400 Subject: [PATCH 2/3] disable step-halving by default, like in logistf 1.24.1 --- firthlogist/firthlogist.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/firthlogist/firthlogist.py b/firthlogist/firthlogist.py index 1de5968..ff81b63 100644 --- a/firthlogist/firthlogist.py +++ b/firthlogist/firthlogist.py @@ -83,10 +83,10 @@ class FirthLogisticRegression(BaseEstimator, ClassifierMixin): def __init__( self, max_iter=25, - max_halfstep=25, + max_halfstep=0, max_stepsize=5, pl_max_iter=100, - pl_max_halfstep=25, + pl_max_halfstep=0, pl_max_stepsize=5, tol=0.0001, fit_intercept=True, From 6a075ec574821244ebe20746430a4bd24ec0d504 Mon Sep 17 00:00:00 2001 From: Jon Luo <20971593+jzluo@users.noreply.github.com> Date: Fri, 29 Jul 2022 18:05:13 -0400 Subject: [PATCH 3/3] bump version to 0.3.1 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 4c1a9e2..d266799 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "firthlogist" -version = "0.3.0" +version = "0.3.1" description = "Python implementation of Logistic Regression with Firth's bias reduction" authors = ["Jon Luo "] repository = "https://github.com/jzluo/firthlogist"