From d930ce1ad26adb10f6911aa3f633c110015b3bc6 Mon Sep 17 00:00:00 2001 From: Luka Sturtewagen Date: Thu, 8 Apr 2021 14:46:22 +0200 Subject: [PATCH 1/5] implemented dataframe.cov --- databricks/koalas/frame.py | 105 ++++++++++++++++++++++ databricks/koalas/missing/frame.py | 1 - databricks/koalas/tests/test_dataframe.py | 22 +++++ 3 files changed, 127 insertions(+), 1 deletion(-) diff --git a/databricks/koalas/frame.py b/databricks/koalas/frame.py index 6b44a2936..da449b07e 100644 --- a/databricks/koalas/frame.py +++ b/databricks/koalas/frame.py @@ -4325,6 +4325,111 @@ def op(kser): return self._apply_series_op(op) + def cov(self, min_periods: Optional[int] = None, ddof: Optional[int] = 1) -> "DataFrame": + """ + Compute pairwise covariance of columns, excluding NA/null values. + Compute the pairwise covariance among the series of a DataFrame. + The returned data frame is the `covariance matrix + <https://en.wikipedia.org/wiki/Covariance_matrix>`__ of the columns + of the DataFrame. + Both NA and null values are automatically excluded from the + calculation. (See the note below about bias from missing values.) + A threshold can be set for the minimum number of + observations for each value created. Comparisons with observations + below this threshold will be returned as ``NaN``. + This method is generally used for the analysis of time series data to + understand the relationship between different measures + across time. + Parameters + ---------- + min_periods : int, optional + Minimum number of observations required per pair of columns + to have a valid result. + ddof : int, default 1 + Delta degrees of freedom. The divisor used in calculations + is ``N - ddof``, where ``N`` represents the number of elements. + + Returns + ------- + DataFrame + The covariance matrix of the series of the DataFrame. + See Also + -------- + Series.cov : Compute covariance with another Series. + core.window.ExponentialMovingWindow.cov : Exponential weighted sample covariance. + core.window.Expanding.cov : Expanding sample covariance. 
+ core.window.Rolling.cov : Rolling sample covariance. + Notes + ----- + Returns the covariance matrix of the DataFrame's time series. + The covariance is normalized by N-ddof. + For DataFrames that have Series that are missing data (assuming that + data is `missing at random + <https://en.wikipedia.org/wiki/Missing_data#Missing_at_random>`__) + the returned covariance matrix will be an unbiased estimate + of the variance and covariance between the member Series. + However, for many applications this estimate may not be acceptable + because the estimated covariance matrix is not guaranteed to be positive + semi-definite. This could lead to estimated correlations having + absolute values which are greater than one, and/or a non-invertible + covariance matrix. See `Estimation of covariance matrices + <https://en.wikipedia.org/wiki/Estimation_of_covariance_matrices>`__ for more details. + Examples + -------- + >>> kdf = ks.DataFrame([(1, 2), (0, 3), (2, 0), (1, 1)], + ... columns=['dogs', 'cats']) + >>> kdf.cov() + dogs cats + dogs 0.666667 -1.000000 + cats -1.000000 1.666667 + >>> np.random.seed(42) + >>> kdf = ks.DataFrame(np.random.randn(1000, 5), + ... columns=['a', 'b', 'c', 'd', 'e']) + >>> kdf.cov() + a b c d e + a 0.998438 -0.020161 0.059277 -0.008943 0.014144 + b -0.020161 1.059352 -0.008543 -0.024738 0.009826 + c 0.059277 -0.008543 1.010670 -0.001486 -0.000271 + d -0.008943 -0.024738 -0.001486 0.921297 -0.013692 + e 0.014144 0.009826 -0.000271 -0.013692 0.977795 + **Minimum number of periods** + This method also supports an optional ``min_periods`` keyword + that specifies the required minimum number of non-NA observations for + each column pair in order to have a valid result: + >>> np.random.seed(42) + >>> kdf = ks.DataFrame(np.random.randn(20, 3), + ... 
+ columns=['a', 'b', 'c']) + >>> kdf.loc[:5, 'a'] = np.nan + >>> kdf.loc[5:10, 'b'] = np.nan + >>> kdf.cov(min_periods=12) + a b c + a 0.331350 NaN -0.148156 + b NaN 1.008785 0.164205 + c -0.148156 0.164205 0.895202 + """ + num_cols = [ + label + for label in self._internal.column_labels + if isinstance(self._internal.spark_type_for(label), (NumericType)) + ] + kdf = self[num_cols] + names = [name for t in num_cols for name in t] + mat = kdf.to_pandas().to_numpy(dtype=float, copy=False) + if not np.isnan(mat).any(): + if min_periods is not None and min_periods > len(mat): + base_cov = np.empty((mat.shape[1], mat.shape[1])) + base_cov.fill(np.nan) + else: + base_cov = np.cov(mat.T, ddof=ddof) + base_cov = base_cov.reshape((len(num_cols), len(num_cols))) + else: + from pandas.core.frame import libalgos + + base_cov = libalgos.nancorr(mat, cov=True, minp=min_periods) + + return DataFrame(base_cov, index=names, columns=names) + def _mark_duplicates(self, subset=None, keep="first"): if subset is None: subset = self._internal.column_labels diff --git a/databricks/koalas/missing/frame.py b/databricks/koalas/missing/frame.py index e8cf8e796..f11f1315a 100644 --- a/databricks/koalas/missing/frame.py +++ b/databricks/koalas/missing/frame.py @@ -43,7 +43,6 @@ class _MissingPandasLikeDataFrame(object): compare = _unsupported_function("compare") convert_dtypes = _unsupported_function("convert_dtypes") corrwith = _unsupported_function("corrwith") - cov = _unsupported_function("cov") ewm = _unsupported_function("ewm") infer_objects = _unsupported_function("infer_objects") interpolate = _unsupported_function("interpolate") diff --git a/databricks/koalas/tests/test_dataframe.py b/databricks/koalas/tests/test_dataframe.py index 90ac55c00..250d63bdc 100644 --- a/databricks/koalas/tests/test_dataframe.py +++ b/databricks/koalas/tests/test_dataframe.py @@ -5543,3 +5543,25 @@ def test_at_time(self): kdf = ks.DataFrame({"A": [1, 2, 3, 4]}) with self.assertRaisesRegex(TypeError, 
"Index must be DatetimeIndex"): kdf.at_time("0:15") + + def test_cov(self): + pdf = pd.DataFrame([(1, 2), (0, 3), (2, 0), (1, 1)], columns=["dogs", "cats"]) + kdf = ks.from_pandas(pdf) + self.assert_eq( + pdf.cov(), kdf.cov(), + ) + + np.random.seed(42) + pdf = pd.DataFrame(np.random.randn(25, 5), columns=["a", "b", "c", "d", "e"]) + kdf = ks.from_pandas(pdf) + self.assert_eq( + pdf.cov(), kdf.cov(), almost=True, + ) + + pdf = pd.DataFrame(np.random.randn(20, 3), columns=["a", "b", "c"]) + pdf.loc[:5, "a"] = np.nan + pdf.loc[5:10, "b"] = np.nan + kdf = ks.from_pandas(pdf) + self.assert_eq( + pdf.cov(min_periods=12), kdf.cov(min_periods=12), + ) From 45d26da35e6df8e0131d1e320eda07a5bf905730 Mon Sep 17 00:00:00 2001 From: Luka Sturtewagen Date: Thu, 8 Apr 2021 18:07:04 +0200 Subject: [PATCH 2/5] fix docstring error --- databricks/koalas/frame.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/databricks/koalas/frame.py b/databricks/koalas/frame.py index da449b07e..31040070e 100644 --- a/databricks/koalas/frame.py +++ b/databricks/koalas/frame.py @@ -4393,7 +4393,9 @@ def cov(self, min_periods: Optional[int] = None, ddof: Optional[int] = 1) -> "Da c 0.059277 -0.008543 1.010670 -0.001486 -0.000271 d -0.008943 -0.024738 -0.001486 0.921297 -0.013692 e 0.014144 0.009826 -0.000271 -0.013692 0.977795 + **Minimum number of periods** + This method also supports an optional ``min_periods`` keyword that specifies the required minimum number of non-NA observations for each column pair in order to have a valid result: From 79871927f2eacbeecf75fbf4e86bcb2514748e3c Mon Sep 17 00:00:00 2001 From: Luka Sturtewagen Date: Thu, 8 Apr 2021 18:17:11 +0200 Subject: [PATCH 3/5] fix whitespace --- databricks/koalas/frame.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/databricks/koalas/frame.py b/databricks/koalas/frame.py index 31040070e..dd3292a62 100644 --- a/databricks/koalas/frame.py +++ b/databricks/koalas/frame.py @@ -4393,9 +4393,9 @@ def cov(self, 
min_periods: Optional[int] = None, ddof: Optional[int] = 1) -> "Da c 0.059277 -0.008543 1.010670 -0.001486 -0.000271 d -0.008943 -0.024738 -0.001486 0.921297 -0.013692 e 0.014144 0.009826 -0.000271 -0.013692 0.977795 - + **Minimum number of periods** - + This method also supports an optional ``min_periods`` keyword that specifies the required minimum number of non-NA observations for each column pair in order to have a valid result: From 8b34a274fa5894c640ff9432564e6b53541b9b86 Mon Sep 17 00:00:00 2001 From: Luka Sturtewagen Date: Thu, 8 Apr 2021 20:33:46 +0200 Subject: [PATCH 4/5] add test for ddof --- databricks/koalas/tests/test_dataframe.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/databricks/koalas/tests/test_dataframe.py b/databricks/koalas/tests/test_dataframe.py index 250d63bdc..e435771be 100644 --- a/databricks/koalas/tests/test_dataframe.py +++ b/databricks/koalas/tests/test_dataframe.py @@ -5565,3 +5565,9 @@ def test_cov(self): self.assert_eq( pdf.cov(min_periods=12), kdf.cov(min_periods=12), ) + if LooseVersion(pd.__version__) >= LooseVersion("1.1.0"): + pdf = pd.DataFrame(np.random.rand(10, 2), columns=["a", "b"]) + kdf = ks.from_pandas(pdf) + self.assert_eq( + pdf.cov(ddof=2), kdf.cov(ddof=2), almost=True, + ) From 6cd946c719d11e7740a90861b335854d3364a9a8 Mon Sep 17 00:00:00 2001 From: Luka Sturtewagen Date: Fri, 9 Apr 2021 12:01:43 +0200 Subject: [PATCH 5/5] added note --- databricks/koalas/frame.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/databricks/koalas/frame.py b/databricks/koalas/frame.py index dd3292a62..97f1d3f1b 100644 --- a/databricks/koalas/frame.py +++ b/databricks/koalas/frame.py @@ -4340,6 +4340,10 @@ def cov(self, min_periods: Optional[int] = None, ddof: Optional[int] = 1) -> "Da This method is generally used for the analysis of time series data to understand the relationship between different measures across time. + + .. 
note:: This method should only be used if the DataFrame is expected + to be small, as all of its data is loaded into the driver's memory. + Parameters ---------- min_periods : int, optional