Skip to content

Commit

Permalink
[SPARK-43873][PS] Enabling FrameDescribeTests
Browse files Browse the repository at this point in the history
### What changes were proposed in this pull request?

This PR proposes to enable the test `FrameDescribeTests`.

### Why are the changes needed?

To increase test coverage for pandas API on Spark with pandas 2.0.0 and above.

### Does this PR introduce _any_ user-facing change?

No, it's test-only.

### How was this patch tested?

Enabling the existing test.

Closes apache#42319 from itholic/pandas_describe.

Authored-by: itholic <[email protected]>
Signed-off-by: Hyukjin Kwon <[email protected]>
  • Loading branch information
itholic authored and HyukjinKwon committed Aug 4, 2023
1 parent 380c0f2 commit 26ed4fb
Showing 1 changed file with 9 additions and 30 deletions.
39 changes: 9 additions & 30 deletions python/pyspark/pandas/tests/computation/test_describe.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,6 @@ def df_pair(self):
psdf = ps.from_pandas(pdf)
return pdf, psdf

@unittest.skipIf(
LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
"TODO(SPARK-43556): Enable DataFrameSlowTests.test_describe for pandas 2.0.0.",
)
def test_describe(self):
pdf, psdf = self.df_pair

Expand Down Expand Up @@ -78,19 +74,10 @@ def test_describe(self):
}
)
pdf = psdf._to_pandas()
# NOTE: Set `datetime_is_numeric=True` for pandas:
# FutureWarning: Treating datetime data as categorical rather than numeric in
# `.describe` is deprecated and will be removed in a future version of pandas.
# Specify `datetime_is_numeric=True` to silence this
# warning and adopt the future behavior now.
# NOTE: Compare the result except percentiles, since we use approximate percentile
# so the result is different from pandas.
if LooseVersion(pd.__version__) >= LooseVersion("1.1.0"):
self.assert_eq(
psdf.describe().loc[["count", "mean", "min", "max"]],
pdf.describe(datetime_is_numeric=True)
.astype(str)
.loc[["count", "mean", "min", "max"]],
pdf.describe().astype(str).loc[["count", "mean", "min", "max"]],
)
else:
self.assert_eq(
Expand Down Expand Up @@ -136,17 +123,13 @@ def test_describe(self):
if LooseVersion(pd.__version__) >= LooseVersion("1.1.0"):
self.assert_eq(
psdf.describe().loc[["count", "mean", "min", "max"]],
pdf.describe(datetime_is_numeric=True)
.astype(str)
.loc[["count", "mean", "min", "max"]],
pdf.describe().astype(str).loc[["count", "mean", "min", "max"]],
)
psdf.A += psdf.A
pdf.A += pdf.A
self.assert_eq(
psdf.describe().loc[["count", "mean", "min", "max"]],
pdf.describe(datetime_is_numeric=True)
.astype(str)
.loc[["count", "mean", "min", "max"]],
pdf.describe().astype(str).loc[["count", "mean", "min", "max"]],
)
else:
expected_result = ps.DataFrame(
Expand Down Expand Up @@ -187,15 +170,15 @@ def test_describe(self):
)
pdf = psdf._to_pandas()
if LooseVersion(pd.__version__) >= LooseVersion("1.1.0"):
pandas_result = pdf.describe(datetime_is_numeric=True)
pandas_result = pdf.describe()
pandas_result.B = pandas_result.B.astype(str)
self.assert_eq(
psdf.describe().loc[["count", "mean", "min", "max"]],
pandas_result.loc[["count", "mean", "min", "max"]],
)
psdf.A += psdf.A
pdf.A += pdf.A
pandas_result = pdf.describe(datetime_is_numeric=True)
pandas_result = pdf.describe()
pandas_result.B = pandas_result.B.astype(str)
self.assert_eq(
psdf.describe().loc[["count", "mean", "min", "max"]],
Expand Down Expand Up @@ -252,7 +235,7 @@ def test_describe(self):
)
pdf = psdf._to_pandas()
if LooseVersion(pd.__version__) >= LooseVersion("1.1.0"):
pandas_result = pdf.describe(datetime_is_numeric=True)
pandas_result = pdf.describe()
pandas_result.b = pandas_result.b.astype(str)
self.assert_eq(
psdf.describe().loc[["count", "mean", "min", "max"]],
Expand Down Expand Up @@ -288,10 +271,6 @@ def test_describe(self):
with self.assertRaisesRegex(ValueError, msg):
psdf.describe()

@unittest.skipIf(
LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
"TODO(SPARK-43556): Enable DataFrameSlowTests.test_describe for pandas 2.0.0.",
)
def test_describe_empty(self):
# Empty DataFrame
psdf = ps.DataFrame(columns=["A", "B"])
Expand Down Expand Up @@ -328,7 +307,7 @@ def test_describe_empty(self):
# For timestamp type, we should convert NaT to None in pandas result
# since pandas API on Spark doesn't support the NaT for object type.
if LooseVersion(pd.__version__) >= LooseVersion("1.1.0"):
pdf_result = pdf[pdf.a != pdf.a].describe(datetime_is_numeric=True)
pdf_result = pdf[pdf.a != pdf.a].describe()
self.assert_eq(
psdf[psdf.a != psdf.a].describe(),
pdf_result.where(pdf_result.notnull(), None).astype(str),
Expand Down Expand Up @@ -367,7 +346,7 @@ def test_describe_empty(self):
)
pdf = psdf._to_pandas()
if LooseVersion(pd.__version__) >= LooseVersion("1.1.0"):
pdf_result = pdf[pdf.a != pdf.a].describe(datetime_is_numeric=True)
pdf_result = pdf[pdf.a != pdf.a].describe()
pdf_result.b = pdf_result.b.where(pdf_result.b.notnull(), None).astype(str)
self.assert_eq(
psdf[psdf.a != psdf.a].describe(),
Expand Down Expand Up @@ -417,7 +396,7 @@ def test_describe_empty(self):
)
pdf = psdf._to_pandas()
if LooseVersion(pd.__version__) >= LooseVersion("1.1.0"):
pdf_result = pdf[pdf.a != pdf.a].describe(datetime_is_numeric=True)
pdf_result = pdf[pdf.a != pdf.a].describe()
self.assert_eq(
psdf[psdf.a != psdf.a].describe(),
pdf_result.where(pdf_result.notnull(), None).astype(str),
Expand Down

0 comments on commit 26ed4fb

Please sign in to comment.