Skip to content

Commit 19a5147

Browse files
authored
Enable automatic column projection for groupby aggregations (#9442)
1 parent 222ea06 commit 19a5147

2 files changed

Lines changed: 24 additions & 3 deletions

File tree

dask/dataframe/groupby.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1654,6 +1654,7 @@ def get_group(self, key):
16541654

16551655
@_aggregate_docstring()
16561656
def aggregate(self, arg, split_every=None, split_out=1, shuffle=None):
1657+
column_projection = None
16571658
if isinstance(self.obj, DataFrame):
16581659
if isinstance(self.by, tuple) or np.isscalar(self.by):
16591660
group_columns = {self.by}
@@ -1681,6 +1682,10 @@ def aggregate(self, arg, split_every=None, split_out=1, shuffle=None):
16811682

16821683
spec = _normalize_spec(arg, non_group_columns)
16831684

1685+
# Check if the aggregation involves implicit column projection
1686+
if isinstance(arg, dict):
1687+
column_projection = group_columns | arg.keys()
1688+
16841689
elif isinstance(self.obj, Series):
16851690
if isinstance(arg, (list, tuple, dict)):
16861691
# implementation detail: if self.obj is a series, a pseudo column
@@ -1709,11 +1714,17 @@ def aggregate(self, arg, split_every=None, split_out=1, shuffle=None):
17091714
else:
17101715
levels = 0
17111716

1717+
# Add an explicit `getitem` operation if the groupby
1718+
# aggregation involves implicit column projection.
1719+
# This makes it possible for the column-projection
1720+
# to be pushed into the IO layer
1721+
_obj = self.obj[list(column_projection)] if column_projection else self.obj
1722+
17121723
if not isinstance(self.by, list):
1713-
chunk_args = [self.obj, self.by]
1724+
chunk_args = [_obj, self.by]
17141725

17151726
else:
1716-
chunk_args = [self.obj] + self.by
1727+
chunk_args = [_obj] + self.by
17171728

17181729
if not PANDAS_GT_110 and self.dropna:
17191730
raise NotImplementedError(

dask/dataframe/tests/test_groupby.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from dask.dataframe.backends import grouper_dispatch
1515
from dask.dataframe.utils import assert_dask_graph, assert_eq, assert_max_deps
1616
from dask.utils import M
17+
from dask.utils_test import hlg_layer
1718

1819
CHECK_FREQ = {}
1920
if dd._compat.PANDAS_GT_110:
@@ -1080,6 +1081,11 @@ def test_aggregate_dask():
10801081
assert_max_deps(agg_dask1, 2)
10811082
assert_max_deps(agg_dask2, 2)
10821083

1084+
# Make sure dict-based aggregation specs result in an
1085+
# explicit `getitem` layer to improve column projection
1086+
if isinstance(spec, dict):
1087+
assert hlg_layer(result1.dask, "getitem")
1088+
10831089
# check for deterministic key names and values.
10841090
# Require pickle since "partial" concat functions
10851091
# used in tree-reduction cannot be compared
@@ -1090,7 +1096,11 @@ def test_aggregate_dask():
10901096
# Note: List-based aggregation specs may result in
10911097
# an extra delayed layer. This is because a "long" list
10921098
# arg will be detected in `dask.array.core.normalize_arg`.
1093-
if isinstance(spec, list) == isinstance(other_spec, list):
1099+
# Also, dict-based aggregation specs will result in
1100+
# an extra `getitem` layer (to improve column projection)
1101+
if (isinstance(spec, list) == isinstance(other_spec, list)) and (
1102+
isinstance(spec, dict) == isinstance(other_spec, dict)
1103+
):
10941104
other = ddf.groupby(["a", "b"]).agg(other_spec, split_every=2)
10951105
assert len(other.dask) == len(result1.dask)
10961106
assert len(other.dask) == len(result2.dask)

0 commit comments

Comments (0)