Skip to content

Commit 25afc34

Browse files
authored
Ensure make_meta doesn't hold ref to data (#9354)
Previously `make_meta(df)` would accidentally hold a reference to the original backing data (which is potentially large), resulting in higher than expected memory usage. We now perform a `copy` to ensure that the view is replaced by a new array, removing these references.
1 parent 377940b commit 25afc34

2 files changed

Lines changed: 24 additions & 2 deletions

File tree

dask/dataframe/backends.py

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -65,12 +65,15 @@ def _(x):
6565

6666
@make_meta_dispatch.register((pd.Series, pd.DataFrame))
6767
def _(x, index=None):
68-
return x.iloc[:0]
68+
out = x.iloc[:0].copy(deep=True)
69+
# index isn't copied by default in pandas, even if deep=True
70+
out.index = out.index.copy(deep=True)
71+
return out
6972

7073

7174
@make_meta_dispatch.register(pd.Index)
7275
def _(x, index=None):
73-
return x[0:0]
76+
return x[0:0].copy(deep=True)
7477

7578

7679
meta_object_types: tuple[type, ...] = (pd.Series, pd.DataFrame, pd.Index, pd.MultiIndex)

dask/dataframe/tests/test_utils_dataframe.py

Lines changed: 19 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -48,17 +48,36 @@ def test_make_meta():
4848
assert len(meta) == 0
4949
assert (meta.dtypes == df.dtypes).all()
5050
assert isinstance(meta.index, type(df.index))
51+
# - ensure no references to original data arrays are kept
52+
for col in "abc":
53+
meta_pointer = meta[col].values.__array_interface__["data"][0]
54+
df_pointer = df[col].values.__array_interface__["data"][0]
55+
assert meta_pointer != df_pointer
56+
meta_pointer = meta.index.values.__array_interface__["data"][0]
57+
df_pointer = df.index.values.__array_interface__["data"][0]
58+
assert meta_pointer != df_pointer
5159

5260
# Pandas series
5361
meta = make_meta(df.a)
5462
assert len(meta) == 0
5563
assert meta.dtype == df.a.dtype
5664
assert isinstance(meta.index, type(df.index))
65+
# - ensure no references to original data arrays are kept
66+
meta_pointer = meta.values.__array_interface__["data"][0]
67+
df_pointer = df.a.values.__array_interface__["data"][0]
68+
assert meta_pointer != df_pointer
69+
meta_pointer = meta.index.values.__array_interface__["data"][0]
70+
df_pointer = df.index.values.__array_interface__["data"][0]
71+
assert meta_pointer != df_pointer
5772

5873
# Pandas index
5974
meta = make_meta(df.index)
6075
assert isinstance(meta, type(df.index))
6176
assert len(meta) == 0
77+
# - ensure no references to original data arrays are kept
78+
meta_pointer = meta.values.__array_interface__["data"][0]
79+
df_pointer = df.index.values.__array_interface__["data"][0]
80+
assert meta_pointer != df_pointer
6281

6382
# Dask object
6483
ddf = dd.from_pandas(df, npartitions=2)

0 commit comments

Comments
 (0)