diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 4920622a15f3f..291eed4174d32 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -94,7 +94,7 @@ Other enhancements - :meth:`DataFrame.reset_index` now accepts a ``names`` argument which renames the index names (:issue:`6878`) - :meth:`pd.concat` now raises when ``levels`` is given but ``keys`` is None (:issue:`46653`) - :meth:`pd.concat` now raises when ``levels`` contains duplicate values (:issue:`46653`) -- +- Added ``numeric_only`` argument to :meth:`DataFrame.corr`, :meth:`DataFrame.corrwith`, and :meth:`DataFrame.cov` (:issue:`46560`) .. --------------------------------------------------------------------------- .. _whatsnew_150.notable_bug_fixes: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index dae6395d2eee4..1a28d9f44ae28 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9764,6 +9764,7 @@ def corr( self, method: str | Callable[[np.ndarray, np.ndarray], float] = "pearson", min_periods: int = 1, + numeric_only: bool = True, ) -> DataFrame: """ Compute pairwise correlation of columns, excluding NA/null values. @@ -9784,6 +9785,10 @@ def corr( Minimum number of observations required per pair of columns to have a valid result. Currently only available for Pearson and Spearman correlation. + numeric_only : bool, default True + Include only `float`, `int` or `boolean` data. + + .. 
versionadded:: 1.5.0 Returns ------- @@ -9823,10 +9828,13 @@ def corr( dogs 1.0 NaN cats NaN 1.0 """ # noqa:E501 - numeric_df = self._get_numeric_data() - cols = numeric_df.columns + if numeric_only: + data = self._get_numeric_data() + else: + data = self + cols = data.columns idx = cols.copy() - mat = numeric_df.to_numpy(dtype=float, na_value=np.nan, copy=False) + mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False) if method == "pearson": correl = libalgos.nancorr(mat, minp=min_periods) @@ -9865,7 +9873,12 @@ def corr( return self._constructor(correl, index=idx, columns=cols) - def cov(self, min_periods: int | None = None, ddof: int | None = 1) -> DataFrame: + def cov( + self, + min_periods: int | None = None, + ddof: int | None = 1, + numeric_only: bool = True, + ) -> DataFrame: """ Compute pairwise covariance of columns, excluding NA/null values. @@ -9896,6 +9909,11 @@ def cov(self, min_periods: int | None = None, ddof: int | None = 1) -> DataFrame .. versionadded:: 1.1.0 + numeric_only : bool, default True + Include only `float`, `int` or `boolean` data. + + .. 
versionadded:: 1.5.0 + Returns ------- DataFrame @@ -9964,10 +9982,13 @@ def cov(self, min_periods: int | None = None, ddof: int | None = 1) -> DataFrame b NaN 1.248003 0.191417 c -0.150812 0.191417 0.895202 """ - numeric_df = self._get_numeric_data() - cols = numeric_df.columns + if numeric_only: + data = self._get_numeric_data() + else: + data = self + cols = data.columns idx = cols.copy() - mat = numeric_df.to_numpy(dtype=float, na_value=np.nan, copy=False) + mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False) if notna(mat).all(): if min_periods is not None and min_periods > len(mat): @@ -9981,7 +10002,14 @@ def cov(self, min_periods: int | None = None, ddof: int | None = 1) -> DataFrame return self._constructor(base_cov, index=idx, columns=cols) - def corrwith(self, other, axis: Axis = 0, drop=False, method="pearson") -> Series: + def corrwith( + self, + other, + axis: Axis = 0, + drop=False, + method="pearson", + numeric_only: bool = True, + ) -> Series: """ Compute pairwise correlation. @@ -10008,6 +10036,11 @@ def corrwith(self, other, axis: Axis = 0, drop=False, method="pearson") -> Serie * callable: callable with input two 1d ndarrays and returning a float. + numeric_only : bool, default True + Include only `float`, `int` or `boolean` data. + + .. 
versionadded:: 1.5.0 + Returns ------- Series @@ -10039,7 +10072,10 @@ def corrwith(self, other, axis: Axis = 0, drop=False, method="pearson") -> Serie dtype: float64 """ # noqa:E501 axis = self._get_axis_number(axis) - this = self._get_numeric_data() + if numeric_only: + this = self._get_numeric_data() + else: + this = self # GH46174: when other is a Series object and axis=0, we achieve a speedup over # passing .corr() to .apply() by taking the columns as ndarrays and iterating @@ -10052,19 +10088,23 @@ def corrwith(self, other, axis: Axis = 0, drop=False, method="pearson") -> Serie if isinstance(other, Series): if axis == 0 and method in ["pearson", "spearman"]: corrs = {} - numeric_cols = self.select_dtypes(include=np.number).columns - ndf = self[numeric_cols].values.transpose() + if numeric_only: + cols = self.select_dtypes(include=np.number).columns + ndf = self[cols].values.transpose() + else: + cols = self.columns + ndf = self.values.transpose() k = other.values if method == "pearson": for i, r in enumerate(ndf): nonnull_mask = ~np.isnan(r) & ~np.isnan(k) - corrs[numeric_cols[i]] = np.corrcoef( - r[nonnull_mask], k[nonnull_mask] - )[0, 1] + corrs[cols[i]] = np.corrcoef(r[nonnull_mask], k[nonnull_mask])[ + 0, 1 + ] else: for i, r in enumerate(ndf): nonnull_mask = ~np.isnan(r) & ~np.isnan(k) - corrs[numeric_cols[i]] = np.corrcoef( + corrs[cols[i]] = np.corrcoef( r[nonnull_mask].argsort().argsort(), k[nonnull_mask].argsort().argsort(), )[0, 1] diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index 2e545942b6f46..3a86aa05fb227 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -83,6 +83,20 @@ def test_cov_nullable_integer(self, other_column): expected = DataFrame(arr, columns=["a", "b"], index=["a", "b"]) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("numeric_only", [True, False]) + def test_cov_numeric_only(self, numeric_only): + 
# when dtypes of pandas series are different + # then ndarray will have dtype=object, + # so it needs to be properly handled + df = DataFrame({"a": [1, 0], "c": ["x", "y"]}) + expected = DataFrame(0.5, index=["a"], columns=["a"]) + if numeric_only: + result = df.cov(numeric_only=numeric_only) + tm.assert_frame_equal(result, expected) + else: + with pytest.raises(ValueError, match="could not convert string to float"): + df.cov(numeric_only=numeric_only) + class TestDataFrameCorr: # DataFrame.corr(), as opposed to DataFrame.corrwith @@ -235,6 +249,22 @@ def test_corr_min_periods_greater_than_length(self, method): ) tm.assert_frame_equal(result, expected) + @td.skip_if_no_scipy + @pytest.mark.parametrize("meth", ["pearson", "kendall", "spearman"]) + @pytest.mark.parametrize("numeric_only", [True, False]) + def test_corr_numeric_only(self, meth, numeric_only): + # when dtypes of pandas series are different + # then ndarray will have dtype=object, + # so it needs to be properly handled + df = DataFrame({"a": [1, 0], "b": [1, 0], "c": ["x", "y"]}) + expected = DataFrame(np.ones((2, 2)), index=["a", "b"], columns=["a", "b"]) + if numeric_only: + result = df.corr(meth, numeric_only=numeric_only) + tm.assert_frame_equal(result, expected) + else: + with pytest.raises(ValueError, match="could not convert string to float"): + df.corr(meth, numeric_only=numeric_only) + class TestDataFrameCorrWith: def test_corrwith(self, datetime_frame): @@ -300,16 +330,21 @@ def test_corrwith_matches_corrcoef(self): tm.assert_almost_equal(c1, c2) assert c1 < 1 - def test_corrwith_mixed_dtypes(self): + @pytest.mark.parametrize("numeric_only", [True, False]) + def test_corrwith_mixed_dtypes(self, numeric_only): # GH#18570 df = DataFrame( {"a": [1, 4, 3, 2], "b": [4, 6, 7, 3], "c": ["a", "b", "c", "d"]} ) s = Series([0, 6, 7, 3]) - result = df.corrwith(s) - corrs = [df["a"].corr(s), df["b"].corr(s)] - expected = Series(data=corrs, index=["a", "b"]) - tm.assert_series_equal(result, expected) + if 
numeric_only: + result = df.corrwith(s, numeric_only=numeric_only) + corrs = [df["a"].corr(s), df["b"].corr(s)] + expected = Series(data=corrs, index=["a", "b"]) + tm.assert_series_equal(result, expected) + else: + with pytest.raises(TypeError, match="not supported for the input types"): + df.corrwith(s, numeric_only=numeric_only) def test_corrwith_index_intersection(self): df1 = DataFrame(np.random.random(size=(10, 2)), columns=["a", "b"])