Skip to content

Commit dcd008f

Browse files
authored
Update datetime_is_numeric behavior in describe for pandas 2.0 (#9868)
1 parent 1476f0f commit dcd008f

2 files changed

Lines changed: 50 additions & 21 deletions

File tree

dask/dataframe/core.py

Lines changed: 16 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -2901,16 +2901,26 @@ def describe(
29012901
percentiles_method="default",
29022902
include=None,
29032903
exclude=None,
2904-
datetime_is_numeric=False,
2904+
datetime_is_numeric=no_default,
29052905
):
2906-
2907-
if PANDAS_GT_110:
2906+
if PANDAS_GT_200:
2907+
if datetime_is_numeric is no_default:
2908+
datetime_is_numeric = True
2909+
datetime_is_numeric_kwarg = {}
2910+
else:
2911+
raise TypeError(
2912+
"datetime_is_numeric is removed in pandas>=2.0.0, datetime data will always be "
2913+
"summarized as numeric"
2914+
)
2915+
elif PANDAS_GT_110:
2916+
datetime_is_numeric = False if datetime_is_numeric is no_default else True
29082917
datetime_is_numeric_kwarg = {"datetime_is_numeric": datetime_is_numeric}
2909-
elif datetime_is_numeric:
2918+
elif datetime_is_numeric is True:
29102919
raise NotImplementedError(
2911-
"datetime_is_numeric=True is only supported for pandas >= 1.1.0"
2920+
"datetime_is_numeric=True is only supported for pandas >= 1.1.0, < 2.0.0"
29122921
)
29132922
else:
2923+
datetime_is_numeric = False
29142924
datetime_is_numeric_kwarg = {}
29152925

29162926
if self._meta.ndim == 1:
@@ -3105,15 +3115,10 @@ def _describe_nonnumeric_1d(
31053115
}
31063116
graph = HighLevelGraph.from_collections(name, layer, dependencies=stats)
31073117

3108-
if PANDAS_GT_110:
3118+
if PANDAS_GT_110 and not PANDAS_GT_200:
31093119
datetime_is_numeric_kwarg = {"datetime_is_numeric": datetime_is_numeric}
3110-
elif datetime_is_numeric:
3111-
raise NotImplementedError(
3112-
"datetime_is_numeric=True is only supported for pandas >= 1.1.0"
3113-
)
31143120
else:
31153121
datetime_is_numeric_kwarg = {}
3116-
31173122
meta = data._meta_nonempty.describe(**datetime_is_numeric_kwarg)
31183123
return new_dd_object(graph, name, meta, divisions=[None, None])
31193124

dask/dataframe/tests/test_dataframe.py

Lines changed: 34 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -67,6 +67,18 @@
6767
CHECK_FREQ["check_freq"] = False
6868

6969

70+
def _drop_mean(df, col=None):
71+
"""TODO: In pandas 2.0, mean is implemented for datetimes, but Dask returns None."""
72+
if isinstance(df, pd.DataFrame):
73+
df.at["mean", col] = np.nan
74+
df.dropna(how="all", inplace=True)
75+
elif isinstance(df, pd.Series):
76+
df.drop(labels=["mean"], inplace=True, errors="ignore")
77+
else:
78+
raise NotImplementedError("Expected Series or DataFrame with mean")
79+
return df
80+
81+
7082
def test_dataframe_doc():
7183
doc = d.add.__doc__
7284
disclaimer = "Some inconsistencies with the Dask version may exist."
@@ -501,7 +513,7 @@ def test_describe(include, exclude, percentiles, subset):
501513

502514
ddf = dd.from_pandas(df, 2)
503515

504-
if PANDAS_GT_110:
516+
if PANDAS_GT_110 and not PANDAS_GT_200:
505517
datetime_is_numeric_kwarg = {"datetime_is_numeric": True}
506518
else:
507519
datetime_is_numeric_kwarg = {}
@@ -520,9 +532,8 @@ def test_describe(include, exclude, percentiles, subset):
520532
**datetime_is_numeric_kwarg,
521533
)
522534

523-
if "e" in expected and datetime_is_numeric_kwarg:
524-
expected.at["mean", "e"] = np.nan
525-
expected.dropna(how="all", inplace=True)
535+
if "e" in expected and (datetime_is_numeric_kwarg or PANDAS_GT_200):
536+
expected = _drop_mean(expected, "e")
526537

527538
assert_eq(actual, expected)
528539

@@ -532,8 +543,8 @@ def test_describe(include, exclude, percentiles, subset):
532543
expected = df[col].describe(
533544
include=include, exclude=exclude, **datetime_is_numeric_kwarg
534545
)
535-
if col == "e" and datetime_is_numeric_kwarg:
536-
expected.drop("mean", inplace=True)
546+
if col == "e" and (datetime_is_numeric_kwarg or PANDAS_GT_200):
547+
expected = _drop_mean(expected)
537548
actual = ddf[col].describe(
538549
include=include, exclude=exclude, **datetime_is_numeric_kwarg
539550
)
@@ -560,13 +571,25 @@ def test_describe_without_datetime_is_numeric():
560571
ddf = dd.from_pandas(df, 2)
561572

562573
# Assert
563-
assert_eq(ddf.describe(), df.describe())
574+
expected = df.describe()
575+
if PANDAS_GT_200:
576+
expected = _drop_mean(expected, "e")
577+
578+
assert_eq(ddf.describe(), expected)
564579

565580
# Check series
566581
for col in ["a", "c"]:
567582
assert_eq(df[col].describe(), ddf[col].describe())
568583

569-
if PANDAS_GT_110:
584+
if PANDAS_GT_200:
585+
expected = _drop_mean(df.e.describe())
586+
assert_eq(expected, ddf.e.describe())
587+
with pytest.raises(
588+
TypeError,
589+
match="datetime_is_numeric is removed in pandas>=2.0.0",
590+
):
591+
ddf.e.describe(datetime_is_numeric=True)
592+
elif PANDAS_GT_110:
570593
with pytest.warns(
571594
FutureWarning,
572595
match=(
@@ -575,10 +598,11 @@ def test_describe_without_datetime_is_numeric():
575598
):
576599
ddf.e.describe()
577600
else:
578-
assert_eq(df.e.describe(), ddf.e.describe())
601+
expected = _drop_mean(df.e.describe())
602+
assert_eq(expected, ddf.e.describe())
579603
with pytest.raises(
580604
NotImplementedError,
581-
match="datetime_is_numeric=True is only supported for pandas >= 1.1.0",
605+
match="datetime_is_numeric=True is only supported for pandas >= 1.1.0, < 2.0.0",
582606
):
583607
ddf.e.describe(datetime_is_numeric=True)
584608

0 commit comments

Comments
 (0)