From 5c058018080e4bc5802f1ebd7769d3e3dc55a8fb Mon Sep 17 00:00:00 2001 From: Aryan Keluskar Date: Sat, 22 Jun 2024 02:31:17 +0530 Subject: [PATCH] Replace Native Welch t-test with statsmodels (#2049) * fix deprecated convert_dtype parameter * fix deprecated behavior of setting an incompatible dtype * fix numpy array conversion deprecation * replaced native welch ttest with statsmodels * updated requirements * remove unused import * added documentation for new code * updated requirements * updated requirements for pypi --- ci/aarch64.conda_requirements.txt | 1 + ci/conda_requirements.txt | 1 + ci/requirements.doc.txt | 1 + setup.py | 1 + skbio/stats/composition.py | 41 ++++++++++++++++++++----------- 5 files changed, 30 insertions(+), 15 deletions(-) diff --git a/ci/aarch64.conda_requirements.txt b/ci/aarch64.conda_requirements.txt index 51c73451f..38887e32d 100644 --- a/ci/aarch64.conda_requirements.txt +++ b/ci/aarch64.conda_requirements.txt @@ -5,3 +5,4 @@ numpy >= 1.17.0 pandas >= 1.5.0 scipy >= 1.9.0 h5py >= 3.6.0 +statsmodels >= 0.14.0 \ No newline at end of file diff --git a/ci/conda_requirements.txt b/ci/conda_requirements.txt index fc36dea2e..4c349f484 100644 --- a/ci/conda_requirements.txt +++ b/ci/conda_requirements.txt @@ -6,3 +6,4 @@ pandas >= 1.5.0 scipy >= 1.9.0 h5py >= 3.6.0 biom-format >= 2.1.16 +statsmodels >= 0.14.0 \ No newline at end of file diff --git a/ci/requirements.doc.txt b/ci/requirements.doc.txt index bd8119df9..18d41190e 100644 --- a/ci/requirements.doc.txt +++ b/ci/requirements.doc.txt @@ -5,3 +5,4 @@ pydata-sphinx-theme sphinxcontrib-youtube numpydoc matplotlib +statsmodels \ No newline at end of file diff --git a/setup.py b/setup.py index 81e65719f..8de6093b4 100644 --- a/setup.py +++ b/setup.py @@ -227,6 +227,7 @@ def check_bin(ccbin, source, allow_dash): "scipy >= 1.9.0", "h5py >= 3.6.0", "biom-format >= 2.1.16", + "statsmodels >= 0.14.0", ], classifiers=classifiers, package_data={ diff --git a/skbio/stats/composition.py b/skbio/stats/composition.py index 9d799a4ec..3136260ea 100644 --- a/skbio/stats/composition.py +++ b/skbio/stats/composition.py @@ -88,6 +88,7 @@ import scipy.stats from scipy.sparse import coo_matrix from scipy.stats import t, gmean +from statsmodels.stats.weightstats import CompareMeans from skbio.stats.distance import DistanceMatrix from skbio.util import find_duplicates @@ -1787,36 +1788,46 @@ def _welch_ttest(a, b): See Also -------- scipy.stats.ttest_ind + statsmodels.stats.weightstats.CompareMeans Notes ----- Compared with ``scipy.stats.ttest_ind`` with ``equal_var=False``, this - function additionally returns confidence intervals. + function additionally returns confidence intervals. This implementation + uses the ``CompareMeans`` class from ``statsmodels.stats.weightstats``. """ # See https://stats.stackexchange.com/a/475345 - n1 = len(a) - n2 = len(b) + # See https://www.statsmodels.org/dev/generated/statsmodels.stats.weightstats.CompareMeans.html + + # Creating a CompareMeans object to perform Welch's t-test + statsmodel_cm_object = CompareMeans.from_data( + data1=a, data2=b, weights1=None, weights2=None + ) + + # Performing Welch's t-test using the object to obtain tstat, pvalue, and df + ttest_cm_result = statsmodel_cm_object.ttest_ind( + alternative="two-sided", usevar="unequal", value=0 + ) + + tstat = ttest_cm_result[0] + p = ttest_cm_result[1] + df = ttest_cm_result[2] + + # Calculating difference between the two means m1 = np.mean(a) m2 = np.mean(b) - v1 = np.var(a, ddof=1) - v2 = np.var(b, ddof=1) - - pooled_se = np.sqrt(v1 / n1 + v2 / n2) delta = m1 - m2 - tstat = delta / pooled_se - df = (v1 / n1 + v2 / n2) ** 2 / ( - v1**2 / (n1**2 * (n1 - 1)) + v2**2 / (n2**2 * (n2 - 1)) + # Calculating confidence intervals using the aformentioned CompareMeans object + conf_int = statsmodel_cm_object.tconfint_diff( + alpha=0.05, alternative="two-sided", usevar="unequal" ) - # two-sided t-test - p = 2 * t.cdf(-abs(tstat), df) + lb = conf_int[0] + ub = conf_int[1] - # upper and lower bounds - lb = delta - t.ppf(0.975, df) * pooled_se - ub = delta + t.ppf(0.975, df) * pooled_se return pd.DataFrame( np.array([tstat, df, p, delta, lb, ub]).reshape(1, -1), columns=["T statistic", "df", "pvalue", "Difference", "CI(2.5)", "CI(97.5)"],