Skip to content

Commit 0a7c41d

Browse files
authored
Feat: read entrypoints in dask.sizeof (#7688)
Entry-points are only used when there is no dispatch for a given type which eliminates the cost for applications which do not need entry-point based implementations.
1 parent bc3137d commit 0a7c41d

3 files changed

Lines changed: 106 additions & 0 deletions

File tree

dask/sizeof.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1+
import importlib.metadata
12
import itertools
3+
import logging
24
import random
35
import sys
46
from array import array
@@ -7,6 +9,8 @@
79

810
sizeof = Dispatch(name="sizeof")
911

12+
logger = logging.getLogger(__name__)
13+
1014

1115
@sizeof.register(object)
1216
def sizeof_default(o):
@@ -208,3 +212,23 @@ def sizeof_pyarrow_table(table):
208212
@sizeof.register(pa.ChunkedArray)
209213
def sizeof_pyarrow_chunked_array(data):
210214
return int(_get_col_size(data)) + 1000
215+
216+
217+
def _register_entry_point_plugins():
218+
"""Register sizeof implementations exposed by the entry_point mechanism."""
219+
if sys.version_info >= (3, 10):
220+
sizeof_entry_points = importlib.metadata.entry_points(group="dask.sizeof")
221+
else:
222+
sizeof_entry_points = importlib.metadata.entry_points().get("dask.sizeof", [])
223+
224+
for entry_point in sizeof_entry_points:
225+
registrar = entry_point.load()
226+
try:
227+
registrar(sizeof)
228+
except Exception:
229+
logger.exception(
230+
f"Failed to register sizeof entry point {entry_point.name}"
231+
)
232+
233+
234+
_register_entry_point_plugins()

dask/tests/test_sizeof.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
1+
import os
12
import sys
23
from array import array
34

45
import pytest
56

7+
from dask.multiprocessing import get_context
68
from dask.sizeof import sizeof
79
from dask.utils import funcname
810

@@ -147,3 +149,47 @@ def test_dict():
147149
d = {i: x for i in range(100)}
148150
assert sizeof(d) > x.nbytes * 100
149151
assert isinstance(sizeof(d), int)
152+
153+
154+
def _get_sizeof_on_path(path, size):
155+
sys.path.append(os.fsdecode(path))
156+
157+
# Dask will have already called _register_entry_point_plugins
158+
# before we can modify sys.path, so we re-register here.
159+
import dask.sizeof
160+
161+
dask.sizeof._register_entry_point_plugins()
162+
163+
import class_impl
164+
165+
cls = class_impl.Impl(size)
166+
return sizeof(cls)
167+
168+
169+
def test_register_backend_entrypoint(tmp_path):
170+
# Create special sizeof implementation for a dummy class
171+
(tmp_path / "impl_sizeof.py").write_bytes(
172+
b"def sizeof_plugin(sizeof):\n"
173+
b' print("REG")\n'
174+
b' @sizeof.register_lazy("class_impl")\n'
175+
b" def register_impl():\n"
176+
b" import class_impl\n"
177+
b" @sizeof.register(class_impl.Impl)\n"
178+
b" def sizeof_impl(obj):\n"
179+
b" return obj.size \n"
180+
)
181+
# Define dummy class that possesses a size attribute
182+
(tmp_path / "class_impl.py").write_bytes(
183+
b"class Impl:\n def __init__(self, size):\n self.size = size"
184+
)
185+
dist_info = tmp_path / "impl_sizeof-0.0.0.dist-info"
186+
dist_info.mkdir()
187+
(dist_info / "entry_points.txt").write_bytes(
188+
b"[dask.sizeof]\nimpl = impl_sizeof:sizeof_plugin\n"
189+
)
190+
191+
with get_context().Pool(1) as pool:
192+
assert (
193+
pool.apply(_get_sizeof_on_path, args=(tmp_path, 3_14159265)) == 3_14159265
194+
)
195+
pool.join()
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
Extend `sizeof`
2+
============
3+
4+
When Dask needs to compute the size of an object in bytes, e.g. to determine which objects to spill to disk, it uses the ``dask.sizeof.sizeof`` registration mechanism. Users who need to define a ``sizeof`` implementation for their own objects can use ``sizeof.register``:
5+
6+
.. code-block:: python
7+
8+
>>> import numpy as np
9+
>>> from dask.sizeof import sizeof
10+
>>> @sizeof.register(np.ndarray)
11+
>>> def sizeof_numpy_like(array):
12+
... return array.nbytes
13+
14+
This code can be executed in order to register the implementation with Dask by placing it in one of the library's modules e.g. ``__init__.py``. However, this introduces a maintenance burden on the developers of these libraries, and must be manually imported on all workers in the event that these libraries do not accept the patch.
15+
16+
Therefore, Dask also exposes an `entrypoint <https://packaging.python.org/specifications/entry-points/>`_ under the group ``dask.sizeof`` to enable third-party libraries to develop and maintain these ``sizeof`` implementations.
17+
18+
For a fictitious library ``numpy_sizeof_dask.py``, the necessary ``setup.cfg`` configuration would be as follows:
19+
20+
.. code-block:: ini
21+
22+
[options.entry_points]
23+
dask.sizeof =
24+
numpy = numpy_sizeof_dask:sizeof_plugin
25+
26+
whilst ``numpy_sizeof_dask.py`` would contain
27+
28+
.. code-block:: python
29+
30+
>>> import numpy as np
31+
>>> def sizeof_plugin(sizeof):
32+
... @sizeof.register(np.ndarray)
33+
... def sizeof_numpy_like(array):
34+
... return array.nbytes
35+
36+
Upon the first import of `dask.sizeof`, Dask calls the entrypoint (``sizeof_plugin``) with the ``dask.sizeof.sizeof`` object, which can then be used to register a sizeof implementation.

0 commit comments

Comments
 (0)