From 5c058018080e4bc5802f1ebd7769d3e3dc55a8fb Mon Sep 17 00:00:00 2001
From: Aryan Keluskar <aryankeluskar6@gmail.com>
Date: Sat, 22 Jun 2024 02:31:17 +0530
Subject: [PATCH] Replace Native Welch t-test with statsmodels (#2049)

* fix deprecated convert_dtype parameter

* fix deprecated behavior of setting an incompatible dtype

* fix numpy array conversion deprecation

* replaced native welch ttest with statsmodels

* updated requirements

* remove unused import

* added documentation for new code

* updated requirements

* updated requirements for pypi
---
 ci/aarch64.conda_requirements.txt |  1 +
 ci/conda_requirements.txt         |  1 +
 ci/requirements.doc.txt           |  1 +
 setup.py                          |  1 +
 skbio/stats/composition.py        | 41 ++++++++++++++++++++-----------
 5 files changed, 30 insertions(+), 15 deletions(-)

diff --git a/ci/aarch64.conda_requirements.txt b/ci/aarch64.conda_requirements.txt
index 51c73451f..38887e32d 100644
--- a/ci/aarch64.conda_requirements.txt
+++ b/ci/aarch64.conda_requirements.txt
@@ -5,3 +5,4 @@ numpy >= 1.17.0
 pandas >= 1.5.0
 scipy >= 1.9.0
 h5py >= 3.6.0
+statsmodels >= 0.14.0
\ No newline at end of file
diff --git a/ci/conda_requirements.txt b/ci/conda_requirements.txt
index fc36dea2e..4c349f484 100644
--- a/ci/conda_requirements.txt
+++ b/ci/conda_requirements.txt
@@ -6,3 +6,4 @@ pandas >= 1.5.0
 scipy >= 1.9.0
 h5py >= 3.6.0
 biom-format >= 2.1.16
+statsmodels >= 0.14.0
\ No newline at end of file
diff --git a/ci/requirements.doc.txt b/ci/requirements.doc.txt
index bd8119df9..18d41190e 100644
--- a/ci/requirements.doc.txt
+++ b/ci/requirements.doc.txt
@@ -5,3 +5,4 @@ pydata-sphinx-theme
 sphinxcontrib-youtube
 numpydoc
 matplotlib
+statsmodels
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 81e65719f..8de6093b4 100644
--- a/setup.py
+++ b/setup.py
@@ -227,6 +227,7 @@ def check_bin(ccbin, source, allow_dash):
         "scipy >= 1.9.0",
         "h5py >= 3.6.0",
         "biom-format >= 2.1.16",
+        "statsmodels >= 0.14.0",
     ],
     classifiers=classifiers,
     package_data={
diff --git a/skbio/stats/composition.py b/skbio/stats/composition.py
index 9d799a4ec..3136260ea 100644
--- a/skbio/stats/composition.py
+++ b/skbio/stats/composition.py
@@ -88,6 +88,7 @@
 import scipy.stats
 from scipy.sparse import coo_matrix
 from scipy.stats import t, gmean
+from statsmodels.stats.weightstats import CompareMeans
 
 from skbio.stats.distance import DistanceMatrix
 from skbio.util import find_duplicates
@@ -1787,36 +1788,46 @@ def _welch_ttest(a, b):
     See Also
     --------
     scipy.stats.ttest_ind
+    statsmodels.stats.weightstats.CompareMeans
 
     Notes
     -----
     Compared with ``scipy.stats.ttest_ind`` with ``equal_var=False``, this
-    function additionally returns confidence intervals.
+    function additionally returns confidence intervals. This implementation
+    uses the ``CompareMeans`` class from ``statsmodels.stats.weightstats``.
 
     """
     # See https://stats.stackexchange.com/a/475345
-    n1 = len(a)
-    n2 = len(b)
+    # See https://www.statsmodels.org/dev/generated/statsmodels.stats.weightstats.CompareMeans.html
+
+    # Creating a CompareMeans object to perform Welch's t-test
+    statsmodel_cm_object = CompareMeans.from_data(
+        data1=a, data2=b, weights1=None, weights2=None
+    )
+
+    # Performing Welch's t-test using the object to obtain tstat, pvalue, and df
+    ttest_cm_result = statsmodel_cm_object.ttest_ind(
+        alternative="two-sided", usevar="unequal", value=0
+    )
+
+    tstat = ttest_cm_result[0]
+    p = ttest_cm_result[1]
+    df = ttest_cm_result[2]
+
+    # Calculating difference between the two means
     m1 = np.mean(a)
     m2 = np.mean(b)
 
-    v1 = np.var(a, ddof=1)
-    v2 = np.var(b, ddof=1)
-
-    pooled_se = np.sqrt(v1 / n1 + v2 / n2)
     delta = m1 - m2
 
-    tstat = delta / pooled_se
-    df = (v1 / n1 + v2 / n2) ** 2 / (
-        v1**2 / (n1**2 * (n1 - 1)) + v2**2 / (n2**2 * (n2 - 1))
+    # Calculating confidence intervals using the aformentioned CompareMeans object
+    conf_int = statsmodel_cm_object.tconfint_diff(
+        alpha=0.05, alternative="two-sided", usevar="unequal"
     )
 
-    # two-sided t-test
-    p = 2 * t.cdf(-abs(tstat), df)
+    lb = conf_int[0]
+    ub = conf_int[1]
 
-    # upper and lower bounds
-    lb = delta - t.ppf(0.975, df) * pooled_se
-    ub = delta + t.ppf(0.975, df) * pooled_se
     return pd.DataFrame(
         np.array([tstat, df, p, delta, lb, ub]).reshape(1, -1),
         columns=["T statistic", "df", "pvalue", "Difference", "CI(2.5)", "CI(97.5)"],