Commit d178233

Efficient dataframe.convert_string support for read_parquet (#9979)

Authored by j-bennet and jrbourbeau
Co-authored-by: James Bourbeau <jrbourbeau@gmail.com>
1 parent 970da68 · commit d178233

4 files changed, with 174 additions and 32 deletions.

File tree:
- dask/dataframe/io/parquet/arrow.py
- dask/dataframe/io/parquet/fastparquet.py
- dask/dataframe/io/parquet/utils.py
- dask/dataframe/io/tests/test_parquet.py

dask/dataframe/io/parquet/arrow.py
Lines changed: 78 additions & 30 deletions
```diff
@@ -7,8 +7,15 @@
 import pandas as pd
 import pyarrow as pa
 import pyarrow.parquet as pq
+
+try:
+    from pyarrow.parquet import filters_to_expression
+except ImportError:
+    from pyarrow.parquet import _filters_to_expression as filters_to_expression
+
 from packaging.version import parse as parse_version
 
+from dask import config
 from dask.base import tokenize
 from dask.core import flatten
 from dask.dataframe.backends import pyarrow_schema_dispatch
```
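The new try/except import is a compatibility shim: `filters_to_expression` only became a public `pyarrow.parquet` name in newer pyarrow releases, while older versions expose it as `_filters_to_expression`. A minimal sketch of the shim in use (illustrative values):

```python
try:
    from pyarrow.parquet import filters_to_expression  # public in newer pyarrow
except ImportError:
    from pyarrow.parquet import _filters_to_expression as filters_to_expression

# A flat list of tuples is read as a conjunction (AND) in DNF filter notation:
expr = filters_to_expression([("b", "==", "cat"), ("x", ">", 3)])
print(type(expr))  # a pyarrow compute Expression, usable as a dataset filter
```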
```diff
@@ -436,7 +443,9 @@ def read_metadata(
         )
 
         # Stage 2: Generate output `meta`
-        meta = cls._create_dd_meta(dataset_info, use_nullable_dtypes)
+        meta = cls._create_dd_meta(
+            dataset_info, use_nullable_dtypes=use_nullable_dtypes
+        )
 
         # Stage 3: Generate parts and stats
         parts, stats, common_kwargs = cls._construct_collection_plan(dataset_info)
```
```diff
@@ -1091,6 +1100,7 @@ def _collect_dataset_info(
             "metadata_task_size": metadata_task_size,
             "kwargs": {
                 "dataset": _dataset_kwargs,
+                "convert_string": config.get("dataframe.convert_string"),
                 **kwargs,
             },
         }
```
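This line snapshots the global `dataframe.convert_string` option into `dataset_info["kwargs"]` when the graph is built, so later stages don't re-read global state. A short sketch of toggling the option via `dask.config` (the default passed to `get` below is defensive, not required):

```python
import dask

# Opt in for a block of code; the engine captures the value when
# dd.read_parquet constructs the graph, so set it before that call.
with dask.config.set({"dataframe.convert_string": True}):
    print(dask.config.get("dataframe.convert_string", False))  # True
```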
```diff
@@ -1123,11 +1133,13 @@ def _create_dd_meta(cls, dataset_info, use_nullable_dtypes=False):
 
         # Use _arrow_table_to_pandas to generate meta
         arrow_to_pandas = dataset_info["kwargs"].get("arrow_to_pandas", {}).copy()
+        convert_string = dataset_info["kwargs"].get("convert_string", False)
         meta = cls._arrow_table_to_pandas(
             schema.empty_table(),
             categories,
             arrow_to_pandas=arrow_to_pandas,
             use_nullable_dtypes=use_nullable_dtypes,
+            convert_string=convert_string,
         )
         index_names = list(meta.index.names)
         column_names = list(meta.columns)
```
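`_create_dd_meta` derives its empty `meta` frame by pushing `schema.empty_table()` through the same conversion path as real partitions, so dtype decisions (now including `convert_string`) show up in `meta` without reading any data. A standalone sketch of that trick, outside of dask:

```python
import pandas as pd
import pyarrow as pa

schema = pa.schema([("a", pa.string()), ("b", pa.int64())])
# Zero rows, but the dtypes match what a full read would produce:
meta = schema.empty_table().to_pandas(
    types_mapper={pa.string(): pd.StringDtype("pyarrow")}.get
)
print(meta.dtypes)  # a -> string[pyarrow], b -> int64
```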
```diff
@@ -1317,7 +1329,7 @@ def _construct_collection_plan(cls, dataset_info):
         # Get/translate filters
         ds_filters = None
         if filters is not None:
-            ds_filters = pq._filters_to_expression(filters)
+            ds_filters = filters_to_expression(filters)
 
         # Define subset of `dataset_info` required by _collect_file_parts
         dataset_info_kwargs = {
@@ -1666,7 +1678,7 @@ def _read_table(
                 use_threads=False,
                 schema=schema,
                 columns=cols,
-                filter=pq._filters_to_expression(filters) if filters else None,
+                filter=filters_to_expression(filters) if filters else None,
             )
         else:
             arrow_table = _read_table_from_path(
```
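Both call sites now route through the shimmed name instead of the private `pq._filters_to_expression`. For reference, the resulting expression plugs directly into a pyarrow dataset scan; a self-contained sketch with an in-memory dataset and illustrative values:

```python
import pyarrow as pa
import pyarrow.dataset as ds
from pyarrow.parquet import filters_to_expression  # assumes pyarrow >= 10

table = pa.table({"b": ["cat", "dog"], "x": [1, 5]})
dataset = ds.dataset(table)  # in-memory dataset, for illustration only
filtered = dataset.to_table(filter=filters_to_expression([("b", "==", "cat")]))
print(filtered.to_pydict())  # {'b': ['cat'], 'x': [1]}
```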
```diff
@@ -1699,40 +1711,76 @@ def _read_table(
         return arrow_table
 
     @classmethod
-    def _arrow_table_to_pandas(
-        cls, arrow_table: pa.Table, categories, use_nullable_dtypes=False, **kwargs
-    ) -> pd.DataFrame:
-        _kwargs = kwargs.get("arrow_to_pandas", {})
-        _kwargs.update({"use_threads": False, "ignore_metadata": False})
-
-        if use_nullable_dtypes:
-            # Determine if `pandas` or `pyarrow`-backed dtypes should be used
-            if use_nullable_dtypes == "pandas":
-                default_types_mapper = PYARROW_NULLABLE_DTYPE_MAPPING.get
+    def _determine_type_mapper(
+        cls, *, use_nullable_dtypes=False, convert_string=False, **kwargs
+    ):
+        user_mapper = kwargs.get("arrow_to_pandas", {}).get("types_mapper")
+        type_mappers = []
+
+        def pyarrow_type_mapper(pyarrow_dtype):
+            # Special case pyarrow strings to use more feature complete dtype
+            # See https://github.com/pandas-dev/pandas/issues/50074
+            if pyarrow_dtype == pa.string():
+                return pd.StringDtype("pyarrow")
             else:
-                # use_nullable_dtypes == "pyarrow"
+                return pd.ArrowDtype(pyarrow_dtype)
 
-                def default_types_mapper(pyarrow_dtype):  # type: ignore
-                    # Special case pyarrow strings to use more feature complete dtype
-                    # See https://github.com/pandas-dev/pandas/issues/50074
-                    if pyarrow_dtype == pa.string():
-                        return pd.StringDtype("pyarrow")
-                    else:
-                        return pd.ArrowDtype(pyarrow_dtype)
+        # always use the user-defined mapper first, if available
+        if user_mapper is not None:
+            type_mappers.append(user_mapper)
 
-        if "types_mapper" in _kwargs:
-            # User-provided entries take priority over default_types_mapper
-            types_mapper = _kwargs["types_mapper"]
+        # next in priority is converting strings
+        if convert_string:
+            type_mappers.append({pa.string(): pd.StringDtype("pyarrow")}.get)
 
-            def _types_mapper(pa_type):
-                return types_mapper(pa_type) or default_types_mapper(pa_type)
+        # and then nullable types
+        if use_nullable_dtypes == "pandas":
+            type_mappers.append(PYARROW_NULLABLE_DTYPE_MAPPING.get)
+        elif use_nullable_dtypes:  # "pyarrow" or True
+            type_mappers.append(pyarrow_type_mapper)
 
-            _kwargs["types_mapper"] = _types_mapper
+        def default_types_mapper(pyarrow_dtype):
+            """Try all type mappers in order, starting from the user type mapper."""
+            for type_converter in type_mappers:
+                converted_type = type_converter(pyarrow_dtype)
+                if converted_type is not None:
+                    return converted_type
 
-        else:
-            _kwargs["types_mapper"] = default_types_mapper
+        if len(type_mappers) > 0:
+            return default_types_mapper
 
-        return arrow_table.to_pandas(categories=categories, **_kwargs)
+    @classmethod
+    def _arrow_table_to_pandas(
+        cls,
+        arrow_table: pa.Table,
+        categories,
+        use_nullable_dtypes=False,
+        convert_string=False,
+        **kwargs,
+    ) -> pd.DataFrame:
+        _kwargs = kwargs.get("arrow_to_pandas", {})
+        _kwargs.update({"use_threads": False, "ignore_metadata": False})
+
+        types_mapper = cls._determine_type_mapper(
+            use_nullable_dtypes=use_nullable_dtypes,
+            convert_string=convert_string,
+            **kwargs,
+        )
+        if types_mapper is not None:
+            _kwargs["types_mapper"] = types_mapper
+
+        res = arrow_table.to_pandas(categories=categories, **_kwargs)
+        # TODO: remove this when fixed in pyarrow: https://github.com/apache/arrow/issues/34283
+        if (
+            convert_string
+            and isinstance(res.index, pd.Index)
+            and not isinstance(res.index, pd.MultiIndex)
+            and pd.api.types.is_string_dtype(res.index.dtype)
+            and res.index.dtype
+            not in (pd.StringDtype("pyarrow"), pd.ArrowDtype(pa.string()))
+        ):
+            res.index = res.index.astype(pd.StringDtype("pyarrow"))
+        return res
 
     @classmethod
     def collect_file_metadata(cls, path, fs, file_path):
```
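The refactor replaces the nested if/else with an ordered chain of type mappers: the user's mapper first, then the `convert_string` string mapper, then the nullable-dtype mapper. The first mapper returning a non-None dtype wins, and None falls through to pyarrow's default conversion. A standalone sketch of that chaining with illustrative mappers:

```python
import pandas as pd
import pyarrow as pa

# Each mapper returns a pandas dtype or None; the first non-None answer wins.
user_mapper = {pa.float32(): pd.Float64Dtype()}.get
string_mapper = {pa.string(): pd.StringDtype("pyarrow")}.get
mappers = [user_mapper, string_mapper]

def chained(pyarrow_dtype):
    for mapper in mappers:
        dtype = mapper(pyarrow_dtype)
        if dtype is not None:
            return dtype
    return None  # None means "use pyarrow's default conversion"

print(chained(pa.float32()))  # Float64 (user mapper takes priority)
print(chained(pa.string()))   # string[pyarrow]
print(chained(pa.int64()))    # None -> default conversion
```

The trailing index `astype` in the new `_arrow_table_to_pandas` exists because `types_mapper` is not applied to the index (apache/arrow#34283, per the TODO in the diff); it can be dropped once that is fixed upstream.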

dask/dataframe/io/parquet/fastparquet.py
Lines changed: 6 additions & 0 deletions
```diff
@@ -20,6 +20,7 @@
 except ImportError:
     pass
 
+from dask import config
 from dask.base import tokenize
 
 #########################
@@ -891,6 +892,11 @@ def read_metadata(
             raise ValueError(
                 "`use_nullable_dtypes` is not supported by the fastparquet engine"
             )
+        if config.get("dataframe.convert_string", False):
+            warnings.warn(
+                "`dataframe.convert_string` is not supported by the fastparquet engine",
+                category=UserWarning,
+            )
 
         # Stage 1: Collect general dataset information
         dataset_info = cls._collect_dataset_info(
```

dask/dataframe/io/parquet/utils.py
Lines changed: 2 additions & 0 deletions
```diff
@@ -194,6 +194,8 @@ def read_partition(
         use_nullable_dtypes: boolean
             Whether to use pandas nullable dtypes (like "string" or "Int64")
             where appropriate when reading parquet files.
+        convert_string: boolean
+            Whether to use pyarrow strings when reading parquet files.
         **kwargs:
             Includes `"kwargs"` values stored within the `parts` output
             of `engine.read_metadata`. May also include arguments to be
```

dask/dataframe/io/tests/test_parquet.py
Lines changed: 88 additions & 2 deletions
```diff
@@ -3330,11 +3330,20 @@ def clamp_arrow_datetimes(cls, arrow_table: pa.Table) -> pa.Table:
 
     @classmethod
     def _arrow_table_to_pandas(
-        cls, arrow_table: pa.Table, categories, use_nullable_dtypes=False, **kwargs
+        cls,
+        arrow_table: pa.Table,
+        categories,
+        use_nullable_dtypes=False,
+        convert_string=False,
+        **kwargs,
     ) -> pd.DataFrame:
         fixed_arrow_table = cls.clamp_arrow_datetimes(arrow_table)
         return super()._arrow_table_to_pandas(
-            fixed_arrow_table, categories, use_nullable_dtypes, **kwargs
+            fixed_arrow_table,
+            categories,
+            use_nullable_dtypes=use_nullable_dtypes,
+            convert_string=convert_string,
+            **kwargs,
         )
 
     # this should not fail, but instead produce timestamps that are in the valid range
```
```diff
@@ -4556,3 +4565,80 @@ def test_select_filtered_column(tmp_path, engine):
     with pytest.warns(UserWarning, match="Sorted columns detected"):
         ddf = dd.read_parquet(path, engine=engine, filters=[("b", "==", "cat")])
     assert_eq(df, ddf)
+
+
+@PYARROW_MARK
+@pytest.mark.parametrize("convert_string", [True, False])
+@pytest.mark.skipif(not PANDAS_GT_150, reason="requires pd.ArrowDtype")
+def test_read_parquet_convert_string(tmp_path, convert_string, engine):
+    df = pd.DataFrame(
+        {"A": ["def", "abc", "ghi"], "B": [5, 2, 3], "C": ["x", "y", "z"]}
+    ).set_index("C")
+
+    outfile = tmp_path / "out.parquet"
+    df.to_parquet(outfile, engine=engine)
+
+    with dask.config.set({"dataframe.convert_string": convert_string}):
+        ddf = dd.read_parquet(outfile, engine="pyarrow")
+
+    if convert_string:
+        expected = df.astype({"A": "string[pyarrow]"})
+        expected.index = expected.index.astype("string[pyarrow]")
+    else:
+        expected = df
+    assert_eq(ddf, expected)
+    assert len(ddf.dask.layers) == 1
+
+
+@PYARROW_MARK
+@pytest.mark.skipif(not PANDAS_GT_150, reason="requires pd.ArrowDtype")
+def test_read_parquet_convert_string_nullable_mapper(tmp_path, engine):
+    """Make sure that when convert_string, use_nullable_dtypes and types_mapper are set,
+    all three are used."""
+    df = pd.DataFrame(
+        {
+            "A": pd.Series(["def", "abc", "ghi"], dtype="string"),
+            "B": pd.Series([5, 2, 3], dtype="Int64"),
+            "C": pd.Series([1.1, 6.3, 8.4], dtype="Float32"),
+            "I": pd.Series(["x", "y", "z"], dtype="string"),
+        }
+    ).set_index("I")
+
+    outfile = tmp_path / "out.parquet"
+    df.to_parquet(outfile, engine=engine)
+
+    types_mapper = {
+        pa.float32(): pd.Float64Dtype(),
+    }
+
+    with dask.config.set({"dataframe.convert_string": True}):
+        ddf = dd.read_parquet(
+            tmp_path,
+            engine="pyarrow",
+            use_nullable_dtypes="pandas",
+            arrow_to_pandas={"types_mapper": types_mapper.get},
+        )
+
+    expected = df.astype(
+        {
+            "A": "string[pyarrow]",  # bc dataframe.convert_string=True
+            "B": pd.Int64Dtype(),  # bc use_nullable_dtypes="pandas"
+            "C": pd.Float64Dtype(),  # bc user mapper
+        }
+    )
+    expected.index = expected.index.astype("string[pyarrow]")
+
+    assert_eq(ddf, expected)
+
+
+@FASTPARQUET_MARK
+def test_read_parquet_convert_string_fastparquet_warns(tmp_path):
+    df = pd.DataFrame({"A": ["def", "abc", "ghi"], "B": [5, 2, 3]})
+    outfile = tmp_path / "out.parquet"
+    df.to_parquet(outfile)
+
+    with dask.config.set({"dataframe.convert_string": True}):
+        with pytest.warns(
+            UserWarning, match="`dataframe.convert_string` is not supported"
+        ):
+            dd.read_parquet(outfile, engine="fastparquet")
```
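Taken together, the tests pin down the user-facing behavior. A minimal end-to-end sketch (assumes pyarrow is installed and the working directory is writable):

```python
import pandas as pd
import dask
import dask.dataframe as dd

pd.DataFrame({"A": ["def", "abc"], "B": [5, 2]}).to_parquet("out.parquet")

with dask.config.set({"dataframe.convert_string": True}):
    ddf = dd.read_parquet("out.parquet", engine="pyarrow")

print(ddf.dtypes)  # A -> string[pyarrow]; B stays int64
```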
