Skip to content

Commit 8b95f98

Browse files
author
Ian Rose
authored
Sparse array reductions (#9342)
1 parent 280ac97 commit 8b95f98

5 files changed

Lines changed: 160 additions & 113 deletions

File tree

dask/array/backends.py

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,15 @@
1+
import math
2+
13
import numpy as np
24

5+
from dask.array import chunk
36
from dask.array.dispatch import (
47
concatenate_lookup,
58
divide_lookup,
69
einsum_lookup,
710
empty_lookup,
11+
nannumel_lookup,
12+
numel_lookup,
813
percentile_lookup,
914
tensordot_lookup,
1015
)
@@ -112,6 +117,8 @@ def _tensordot(a, b, axes=2):
112117

113118
@tensordot_lookup.register_lazy("cupy")
114119
@concatenate_lookup.register_lazy("cupy")
120+
@nannumel_lookup.register_lazy("cupy")
121+
@numel_lookup.register_lazy("cupy")
115122
def register_cupy():
116123
import cupy
117124

@@ -120,6 +127,8 @@ def register_cupy():
120127
concatenate_lookup.register(cupy.ndarray, cupy.concatenate)
121128
tensordot_lookup.register(cupy.ndarray, cupy.tensordot)
122129
percentile_lookup.register(cupy.ndarray, percentile)
130+
numel_lookup.register(cupy.ndarray, _numel_arraylike)
131+
nannumel_lookup.register(cupy.ndarray, _nannumel)
123132

124133
@einsum_lookup.register(cupy.ndarray)
125134
def _cupy_einsum(*args, **kwargs):
@@ -160,11 +169,18 @@ def _concat_cupy_sparse(L, axis=0):
160169

161170
@tensordot_lookup.register_lazy("sparse")
162171
@concatenate_lookup.register_lazy("sparse")
172+
@nannumel_lookup.register_lazy("sparse")
173+
@numel_lookup.register_lazy("sparse")
163174
def register_sparse():
164175
import sparse
165176

166177
concatenate_lookup.register(sparse.COO, sparse.concatenate)
167178
tensordot_lookup.register(sparse.COO, sparse.tensordot)
179+
# Enforce dense ndarray for the numel result, since the sparse
180+
# array will wind up being dense with an unpredictable fill_value.
181+
# https://github.com/dask/dask/issues/7169
182+
numel_lookup.register(sparse.COO, _numel_ndarray)
183+
nannumel_lookup.register(sparse.COO, _nannumel_sparse)
168184

169185

170186
@tensordot_lookup.register_lazy("scipy")
@@ -203,3 +219,80 @@ def _tensordot_scipy_sparse(a, b, axes):
203219
return a * b
204220
elif a_axis == 1 and b_axis == 1:
205221
return a * b.T
222+
223+
224+
@numel_lookup.register(np.ma.masked_array)
def _numel_masked(x, **kwargs):
    """Count elements of a masked array by summing an all-ones array of its shape."""
    ones = np.ones_like(x)
    return chunk.sum(ones, **kwargs)
228+
229+
230+
@numel_lookup.register((object, np.ndarray))
def _numel_ndarray(x, **kwargs):
    """Count elements, always materializing the result as a plain ``numpy.ndarray``."""
    return _numel(x, coerce_np_ndarray=True, **kwargs)
234+
235+
236+
def _numel_arraylike(x, **kwargs):
    """Count elements, keeping the result in the same array family as ``x``."""
    return _numel(x, coerce_np_ndarray=False, **kwargs)
239+
240+
241+
def _numel(x, coerce_np_ndarray: bool, **kwargs):
242+
"""
243+
A reduction to count the number of elements.
244+
245+
This has an additional kwarg in coerce_np_ndarray, which determines
246+
whether to ensure that the resulting array is a numpy.ndarray, or whether
247+
we allow it to be other array types via `np.full_like`.
248+
"""
249+
shape = x.shape
250+
keepdims = kwargs.get("keepdims", False)
251+
axis = kwargs.get("axis", None)
252+
dtype = kwargs.get("dtype", np.float64)
253+
254+
if axis is None:
255+
prod = np.prod(shape, dtype=dtype)
256+
if keepdims is False:
257+
return prod
258+
259+
if coerce_np_ndarray:
260+
return np.full(shape=(1,) * len(shape), fill_value=prod, dtype=dtype)
261+
else:
262+
return np.full_like(x, prod, shape=(1,) * len(shape), dtype=dtype)
263+
264+
if not isinstance(axis, (tuple, list)):
265+
axis = [axis]
266+
267+
prod = math.prod(shape[dim] for dim in axis)
268+
if keepdims is True:
269+
new_shape = tuple(
270+
shape[dim] if dim not in axis else 1 for dim in range(len(shape))
271+
)
272+
else:
273+
new_shape = tuple(shape[dim] for dim in range(len(shape)) if dim not in axis)
274+
275+
if coerce_np_ndarray:
276+
return np.broadcast_to(np.array(prod, dtype=dtype), new_shape)
277+
else:
278+
return np.full_like(x, prod, shape=new_shape, dtype=dtype)
279+
280+
281+
@nannumel_lookup.register((object, np.ndarray))
def _nannumel(x, **kwargs):
    """Count elements excluding NaNs, by summing the boolean non-NaN mask."""
    not_nan = ~np.isnan(x)
    return chunk.sum(not_nan, **kwargs)
285+
286+
287+
def _nannumel_sparse(x, **kwargs):
    """
    Count the non-NaN elements of a sparse array.

    The counting mask is in general dense with an unpredictable fill value,
    so the result is explicitly converted to a dense array.

    https://github.com/dask/dask/issues/7169
    """
    counted = _nannumel(x, **kwargs)
    # A fully-contracted reduction yields a plain scalar with no ``todense``.
    if hasattr(counted, "todense"):
        return counted.todense()
    return counted

dask/array/dispatch.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,3 +12,5 @@
1212
empty_lookup = Dispatch("empty")
1313
divide_lookup = Dispatch("divide")
1414
percentile_lookup = Dispatch("percentile")
15+
numel_lookup = Dispatch("numel")
16+
nannumel_lookup = Dispatch("nannumel")

dask/array/reductions.py

Lines changed: 9 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,7 @@
2323
unknown_chunk_message,
2424
)
2525
from dask.array.creation import arange, diagonal
26-
27-
# Keep empty_lookup here for backwards compatibility
28-
from dask.array.dispatch import divide_lookup, empty_lookup # noqa: F401
26+
from dask.array.dispatch import divide_lookup, nannumel_lookup, numel_lookup
2927
from dask.array.utils import (
3028
array_safe,
3129
asarray_safe,
@@ -54,6 +52,14 @@ def divide(a, b, dtype=None):
5452
return f(a, b, dtype=dtype)
5553

5654

55+
def numel(x, **kwargs):
    """A reduction counting elements, dispatched on the array type of ``x``."""
    return numel_lookup(x, **kwargs)
57+
58+
59+
def nannumel(x, **kwargs):
    """A reduction counting non-NaN elements, dispatched on the array type of ``x``."""
    return nannumel_lookup(x, **kwargs)
61+
62+
5763
def reduction(
5864
x,
5965
chunk,
@@ -638,43 +644,6 @@ def _nanmax_skip(x_chunk, axis, keepdims):
638644
)
639645

640646

641-
def numel(x, **kwargs):
642-
"""A reduction to count the number of elements"""
643-
644-
if hasattr(x, "mask"):
645-
return chunk.sum(np.ones_like(x), **kwargs)
646-
647-
shape = x.shape
648-
keepdims = kwargs.get("keepdims", False)
649-
axis = kwargs.get("axis", None)
650-
dtype = kwargs.get("dtype", np.float64)
651-
652-
if axis is None:
653-
prod = np.prod(shape, dtype=dtype)
654-
return (
655-
np.full_like(x, prod, shape=(1,) * len(shape), dtype=dtype)
656-
if keepdims is True
657-
else prod
658-
)
659-
660-
if not isinstance(axis, tuple or list):
661-
axis = [axis]
662-
663-
prod = math.prod(shape[dim] for dim in axis)
664-
if keepdims is True:
665-
new_shape = tuple(
666-
shape[dim] if dim not in axis else 1 for dim in range(len(shape))
667-
)
668-
else:
669-
new_shape = tuple(shape[dim] for dim in range(len(shape)) if dim not in axis)
670-
return np.full_like(x, prod, shape=new_shape, dtype=dtype)
671-
672-
673-
def nannumel(x, **kwargs):
674-
"""A reduction to count the number of elements"""
675-
return chunk.sum(~(np.isnan(x)), **kwargs)
676-
677-
678647
def mean_chunk(
679648
x, sum=chunk.sum, numel=numel, dtype="f8", computing_meta=False, **kwargs
680649
):

dask/array/tests/test_reductions.py

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -17,31 +17,44 @@
1717

1818
@pytest.mark.parametrize("dtype", ["f4", "i4"])
1919
@pytest.mark.parametrize("keepdims", [True, False])
20-
def test_numel(dtype, keepdims):
20+
@pytest.mark.parametrize("nan", [True, False])
21+
def test_numel(dtype, keepdims, nan):
2122
x = np.ones((2, 3, 4))
23+
if nan:
24+
y = np.random.uniform(-1, 1, size=(2, 3, 4))
25+
x[y < 0] = np.nan
26+
numel = da.reductions.nannumel
27+
28+
def _sum(arr, **kwargs):
29+
n = np.sum(np.ma.masked_where(np.isnan(arr), arr), **kwargs)
30+
return n.filled(0) if isinstance(n, np.ma.MaskedArray) else n
31+
32+
else:
33+
numel = da.reductions.numel
34+
_sum = np.sum
2235

2336
assert_eq(
24-
da.reductions.numel(x, axis=(), keepdims=keepdims, dtype=dtype),
25-
np.sum(x, axis=(), keepdims=keepdims, dtype=dtype),
37+
numel(x, axis=(), keepdims=keepdims, dtype=dtype),
38+
_sum(x, axis=(), keepdims=keepdims, dtype=dtype),
2639
)
2740
assert_eq(
28-
da.reductions.numel(x, axis=0, keepdims=keepdims, dtype=dtype),
29-
np.sum(x, axis=0, keepdims=keepdims, dtype=dtype),
41+
numel(x, axis=0, keepdims=keepdims, dtype=dtype),
42+
_sum(x, axis=0, keepdims=keepdims, dtype=dtype),
3043
)
3144

3245
for length in range(x.ndim):
3346
for sub in itertools.combinations([d for d in range(x.ndim)], length):
3447
assert_eq(
35-
da.reductions.numel(x, axis=sub, keepdims=keepdims, dtype=dtype),
36-
np.sum(x, axis=sub, keepdims=keepdims, dtype=dtype),
48+
numel(x, axis=sub, keepdims=keepdims, dtype=dtype),
49+
_sum(x, axis=sub, keepdims=keepdims, dtype=dtype),
3750
)
3851

3952
for length in range(x.ndim):
4053
for sub in itertools.combinations([d for d in range(x.ndim)], length):
4154
ssub = np.random.shuffle(list(sub))
4255
assert_eq(
43-
da.reductions.numel(x, axis=ssub, keepdims=keepdims, dtype=dtype),
44-
np.sum(x, axis=ssub, keepdims=keepdims, dtype=dtype),
56+
numel(x, axis=ssub, keepdims=keepdims, dtype=dtype),
57+
_sum(x, axis=ssub, keepdims=keepdims, dtype=dtype),
4558
)
4659

4760

dask/array/tests/test_sparse.py

Lines changed: 34 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,10 @@
1-
import random
2-
31
import numpy as np
42
import pytest
53
from packaging.version import parse as parse_version
64

75
import dask
86
import dask.array as da
7+
from dask.array.reductions import nannumel, numel
98
from dask.array.utils import assert_eq
109

1110
sparse = pytest.importorskip("sparse")
@@ -30,20 +29,26 @@
3029
lambda x: x[:1, None, 1:3],
3130
lambda x: x.T,
3231
lambda x: da.transpose(x, (1, 2, 0)),
32+
lambda x: da.nanmean(x),
33+
lambda x: da.nanmean(x, axis=1),
34+
lambda x: da.nanmax(x),
35+
lambda x: da.nanmin(x),
36+
lambda x: da.nanprod(x),
37+
lambda x: da.nanstd(x),
38+
lambda x: da.nanvar(x),
39+
lambda x: da.nansum(x),
40+
# These nan* variants are not implemented by sparse.COO
41+
# lambda x: da.median(x, axis=0),
42+
# lambda x: da.nanargmax(x),
43+
# lambda x: da.nanargmin(x),
44+
# lambda x: da.nancumprod(x, axis=0),
45+
# lambda x: da.nancumsum(x, axis=0),
3346
lambda x: x.sum(),
3447
lambda x: x.moment(order=0),
35-
pytest.param(
36-
lambda x: x.mean(),
37-
marks=pytest.mark.xfail(reason="https://github.com/dask/dask/issues/7169"),
38-
),
39-
pytest.param(
40-
lambda x: x.std(),
41-
marks=pytest.mark.xfail(reason="https://github.com/dask/dask/issues/7169"),
42-
),
43-
pytest.param(
44-
lambda x: x.var(),
45-
marks=pytest.mark.xfail(reason="https://github.com/dask/dask/issues/7169"),
46-
),
48+
lambda x: x.mean(),
49+
lambda x: x.mean(axis=1),
50+
lambda x: x.std(),
51+
lambda x: x.var(),
4752
lambda x: x.dot(np.arange(x.shape[-1])),
4853
lambda x: x.dot(np.eye(x.shape[-1])),
4954
lambda x: da.tensordot(x, np.ones(x.shape[:2]), axes=[(0, 1), (0, 1)]),
@@ -125,56 +130,6 @@ def test_tensordot():
125130
)
126131

127132

128-
@pytest.mark.xfail(reason="upstream change", strict=False)
129-
@pytest.mark.parametrize("func", functions)
130-
def test_mixed_concatenate(func):
131-
x = da.random.random((2, 3, 4), chunks=(1, 2, 2))
132-
133-
y = da.random.random((2, 3, 4), chunks=(1, 2, 2))
134-
y[y < 0.8] = 0
135-
yy = y.map_blocks(sparse.COO.from_numpy)
136-
137-
d = da.concatenate([x, y], axis=0)
138-
s = da.concatenate([x, yy], axis=0)
139-
140-
dd = func(d)
141-
ss = func(s)
142-
143-
assert_eq(dd, ss)
144-
145-
146-
@pytest.mark.xfail(reason="upstream change", strict=False)
147-
@pytest.mark.parametrize("func", functions)
148-
def test_mixed_random(func):
149-
d = da.random.random((4, 3, 4), chunks=(1, 2, 2))
150-
d[d < 0.7] = 0
151-
152-
fn = lambda x: sparse.COO.from_numpy(x) if random.random() < 0.5 else x
153-
s = d.map_blocks(fn)
154-
155-
dd = func(d)
156-
ss = func(s)
157-
158-
assert_eq(dd, ss)
159-
160-
161-
@pytest.mark.xfail(reason="upstream change", strict=False)
162-
def test_mixed_output_type():
163-
y = da.random.random((10, 10), chunks=(5, 5))
164-
y[y < 0.8] = 0
165-
y = y.map_blocks(sparse.COO.from_numpy)
166-
167-
x = da.zeros((10, 1), chunks=(5, 1))
168-
169-
z = da.concatenate([x, y], axis=1)
170-
171-
assert z.shape == (10, 11)
172-
173-
zz = z.compute()
174-
assert isinstance(zz, sparse.COO)
175-
assert zz.nnz == y.compute().nnz
176-
177-
178133
def test_metadata():
179134
y = da.random.random((10, 10), chunks=(5, 5))
180135
y[y < 0.8] = 0
@@ -239,3 +194,18 @@ def test_meta_from_array():
239194
x = sparse.COO.from_numpy(np.eye(1))
240195
y = da.utils.meta_from_array(x, ndim=2)
241196
assert isinstance(y, sparse.COO)
197+
198+
199+
@pytest.mark.parametrize("numel", [numel, nannumel])
@pytest.mark.parametrize("axis", [0, (0, 1), None])
@pytest.mark.parametrize("keepdims", [True, False])
def test_numel(numel, axis, keepdims):
    """numel/nannumel must agree between a dense ndarray and its sparse COO form."""
    dense = np.random.random((2, 3, 4))
    dense[dense < 0.8] = 0
    dense[dense > 0.9] = np.nan

    coo = sparse.COO.from_numpy(dense, fill_value=0.0)

    expected = numel(dense, axis=axis, keepdims=keepdims)
    actual = numel(coo, axis=axis, keepdims=keepdims)
    assert_eq(expected, actual)

0 commit comments

Comments
 (0)