Skip to content

Commit 945435b — "Remove statistics-based set_index logic from read_parquet (#9661)"
(1 parent: b1e468e)

3 files changed

Lines changed: 63 additions & 68 deletions

File tree

dask/dataframe/io/parquet/arrow.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -995,7 +995,7 @@ def _create_dd_meta(cls, dataset_info, use_nullable_dtypes=False):
995995
columns = None
996996

997997
# Use pandas metadata to update categories
998-
pandas_metadata = _get_pandas_metadata(schema)
998+
pandas_metadata = _get_pandas_metadata(schema) or {}
999999
if pandas_metadata:
10001000
if categories is None:
10011001
categories = []
@@ -1021,7 +1021,15 @@ def _create_dd_meta(cls, dataset_info, use_nullable_dtypes=False):
10211021

10221022
# Use index specified in the pandas metadata if
10231023
# the index column was not specified by the user
1024-
if index is None and index_names:
1024+
if (
1025+
index is None
1026+
and index_names
1027+
and (
1028+
# Only set to `[None]` if pandas metadata includes an index
1029+
index_names != [None]
1030+
or pandas_metadata.get("index_columns", None)
1031+
)
1032+
):
10251033
index = index_names
10261034

10271035
# Set proper index for meta

dask/dataframe/io/parquet/core.py

Lines changed: 42 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -509,7 +509,7 @@ def read_parquet(
509509
aggregation_depth = parts[0].pop("aggregation_depth", aggregation_depth)
510510

511511
# Parse dataset statistics from metadata (if available)
512-
parts, divisions, index, index_in_columns = process_statistics(
512+
parts, divisions, index = process_statistics(
513513
parts,
514514
statistics,
515515
filters,
@@ -522,18 +522,10 @@ def read_parquet(
522522

523523
# Account for index and columns arguments.
524524
# Modify `meta` dataframe accordingly
525-
meta, index, columns = set_index_columns(
526-
meta, index, columns, index_in_columns, auto_index_allowed
527-
)
525+
meta, index, columns = set_index_columns(meta, index, columns, auto_index_allowed)
528526
if meta.index.name == NONE_LABEL:
529527
meta.index.name = None
530528

531-
# Set the index that was previously treated as a column
532-
if index_in_columns:
533-
meta = meta.set_index(index)
534-
if meta.index.name == NONE_LABEL:
535-
meta.index.name = None
536-
537529
if len(divisions) < 2:
538530
# empty dataframe - just use meta
539531
divisions = (None, None)
@@ -1209,11 +1201,13 @@ def get_engine(engine):
12091201
#####################
12101202

12111203

1212-
def sorted_columns(statistics):
1204+
def sorted_columns(statistics, columns=None):
12131205
"""Find sorted columns given row-group statistics
12141206
1215-
This finds all columns that are sorted, along with appropriate divisions
1216-
values for those columns
1207+
This finds all columns that are sorted, along with the
1208+
appropriate ``divisions`` for those columns. If the (optional)
1209+
``columns`` argument is used, the search will be restricted
1210+
to the specified column set.
12171211
12181212
Returns
12191213
-------
@@ -1224,6 +1218,8 @@ def sorted_columns(statistics):
12241218

12251219
out = []
12261220
for i, c in enumerate(statistics[0]["columns"]):
1221+
if columns and c["name"] not in columns:
1222+
continue
12271223
if not all(
12281224
"min" in s["columns"][i] and "max" in s["columns"][i] for s in statistics
12291225
):
@@ -1347,7 +1343,6 @@ def process_statistics(
13471343
"""Process row-group column statistics in metadata
13481344
Used in read_parquet.
13491345
"""
1350-
index_in_columns = False
13511346
if statistics and len(parts) != len(statistics):
13521347
# It is up to the Engine to guarantee that these
13531348
# lists are the same length (if statistics are defined).
@@ -1362,6 +1357,7 @@ def process_statistics(
13621357
)
13631358
statistics = []
13641359

1360+
divisions = None
13651361
if statistics:
13661362
result = list(
13671363
zip(
@@ -1382,52 +1378,43 @@ def process_statistics(
13821378
parts, statistics, chunksize, split_row_groups, fs, aggregation_depth
13831379
)
13841380

1385-
out = sorted_columns(statistics)
1381+
# Convert str index to list
1382+
index = [index] if isinstance(index, str) else index
13861383

1387-
if index and isinstance(index, str):
1388-
index = [index]
1389-
if index and out:
1390-
# Only one valid column
1391-
out = [o for o in out if o["name"] in index]
1392-
if index is not False and len(out) == 1:
1393-
# Use only sorted column with statistics as the index
1394-
divisions = out[0]["divisions"]
1395-
if index is None:
1396-
index_in_columns = True
1397-
index = [out[0]["name"]]
1398-
elif index != [out[0]["name"]]:
1399-
raise ValueError(f"Specified index is invalid.\nindex: {index}")
1400-
elif index is not False and len(out) > 1:
1401-
if any(o["name"] == NONE_LABEL for o in out):
1402-
# Use sorted column matching NONE_LABEL as the index
1403-
[o] = [o for o in out if o["name"] == NONE_LABEL]
1404-
divisions = o["divisions"]
1405-
if index is None:
1406-
index = [o["name"]]
1407-
index_in_columns = True
1408-
elif index != [o["name"]]:
1409-
raise ValueError(f"Specified index is invalid.\nindex: {index}")
1410-
else:
1411-
# Multiple sorted columns found, cannot autodetect the index
1384+
# TODO: Remove `filters` criteria below after deprecation cycle.
1385+
# We can then remove the `sorted_col_names` logic and warning.
1386+
# See: https://github.com/dask/dask/pull/9661
1387+
process_columns = index if index and len(index) == 1 else None
1388+
if filters:
1389+
process_columns = None
1390+
1391+
# Use statistics to define divisions
1392+
if process_columns or filters:
1393+
sorted_col_names = []
1394+
for sorted_column_info in sorted_columns(
1395+
statistics, columns=process_columns
1396+
):
1397+
if index and sorted_column_info["name"] in index:
1398+
divisions = sorted_column_info["divisions"]
1399+
break
1400+
else:
1401+
# Filtered columns may also be sorted
1402+
sorted_col_names.append(sorted_column_info["name"])
1403+
1404+
if index is None and sorted_col_names:
1405+
assert bool(filters) # Should only get here when filtering
14121406
warnings.warn(
1413-
"Multiple sorted columns found %s, cannot\n "
1414-
"autodetect index. Will continue without an index.\n"
1415-
"To pick an index column, use the index= keyword; to \n"
1416-
"silence this warning use index=False."
1417-
"" % [o["name"] for o in out],
1418-
RuntimeWarning,
1407+
f"Sorted columns detected: {sorted_col_names}\n"
1408+
f"Use the `index` argument to set a sorted column as your "
1409+
f"index to create a DataFrame collection with known `divisions`.",
1410+
UserWarning,
14191411
)
1420-
index = False
1421-
divisions = [None] * (len(parts) + 1)
1422-
else:
1423-
divisions = [None] * (len(parts) + 1)
1424-
else:
1425-
divisions = [None] * (len(parts) + 1)
14261412

1427-
return parts, divisions, index, index_in_columns
1413+
divisions = divisions or (None,) * (len(parts) + 1)
1414+
return parts, divisions, index
14281415

14291416

1430-
def set_index_columns(meta, index, columns, index_in_columns, auto_index_allowed):
1417+
def set_index_columns(meta, index, columns, auto_index_allowed):
14311418
"""Handle index/column arguments, and modify `meta`
14321419
Used in read_parquet.
14331420
"""
@@ -1471,18 +1458,7 @@ def set_index_columns(meta, index, columns, index_in_columns, auto_index_allowed
14711458
"index: {} | column: {}".format(index, columns)
14721459
)
14731460

1474-
# Leaving index as a column in `meta`, because the index
1475-
# will be reset below (in case the index was detected after
1476-
# meta was created)
1477-
if index_in_columns:
1478-
meta = meta[columns + index]
1479-
else:
1480-
meta = meta[columns]
1481-
1482-
else:
1483-
meta = meta[list(columns)]
1484-
1485-
return meta, index, columns
1461+
return meta[list(columns)], index, columns
14861462

14871463

14881464
def aggregate_row_groups(

dask/dataframe/io/tests/test_parquet.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4401,3 +4401,14 @@ def test_retries_on_remote_filesystem(tmpdir):
44014401
layer = hlg_layer(ddf2.dask, "read-parquet")
44024402
assert layer.annotations
44034403
assert layer.annotations["retries"] == 2
4404+
4405+
4406+
def test_select_filtered_column(tmp_path, engine):
4407+
4408+
df = pd.DataFrame({"a": range(10), "b": ["cat"] * 10})
4409+
path = tmp_path / "test_select_filtered_column.parquet"
4410+
df.to_parquet(path, index=False)
4411+
4412+
with pytest.warns(UserWarning, match="Sorted columns detected"):
4413+
ddf = dd.read_parquet(path, engine=engine, filters=[("b", "==", "cat")])
4414+
assert_eq(df, ddf)

0 commit comments

Comments (0)